2
by Richard Sellam
Initial release (closes: #703046) |
1 |
<?php
|
2 |
||
3 |
/**
|
|
4 |
* Compute edit distance between two scalar sequences. This class uses
|
|
5 |
* Levenshtein (or Damerau-Levenshtein) to compute the edit distance between
|
|
6 |
* two inputs. The inputs are arrays containing any scalars (not just strings)
|
|
7 |
* so it can be used with, e.g., utf8 sequences.
|
|
8 |
*
|
|
9 |
* $matrix = id(new PhutilEditDistanceMatrix())
|
|
10 |
* ->setSequences(str_split('ran'), str_split('rat'));
|
|
11 |
*
|
|
12 |
* $cost = $matrix->getEditDistance();
|
|
13 |
*
|
|
14 |
* Edit distance computation is slow and requires a large amount of memory;
|
|
15 |
* both are roughly O(N * M) in the length of the strings.
|
|
16 |
*
|
|
17 |
* You can customize the cost of insertions, deletions and replacements. By
|
|
18 |
* default, each has a cost of 1.
|
|
19 |
*
|
|
20 |
* $matrix->setReplaceCost(1);
|
|
21 |
*
|
|
22 |
* By default, transpositions are not evaluated (i.e., the algorithm is
|
|
23 |
* Levenshtein). You can cause transpositions to be evaluated by setting a
|
|
24 |
* transpose cost (which will change the algorithm to Damerau-Levenshtein):
|
|
25 |
*
|
|
26 |
* $matrix->setTransposeCost(1);
|
|
27 |
*
|
|
28 |
* You can also provide a cost to alter the type of operation being applied.
|
|
29 |
* Many strings have several equivalently expensive edit sequences, but one
|
|
30 |
* some sequences are more readable to humans than others. Providing a small
|
|
31 |
* cost to alter operation type tends to smooth out the sequence and produce
|
|
32 |
* long runs of a single operation, which are generally more readable. For
|
|
33 |
* example, these strings:
|
|
34 |
*
|
|
35 |
* (x)
|
|
36 |
* ((x))
|
|
37 |
*
|
|
38 |
* ...have edit strings "issis" and "isssi", which describe edit operations with
|
|
39 |
* the same cost, but the latter string is more comprehensible to human viewers.
|
|
40 |
*
|
|
41 |
* If you set an alter cost, you must call @{method:setComputeString} to enable
|
|
42 |
* type computation. The alter cost should generally be smaller than `c / N`,
|
|
43 |
* where `c` is the smallest operational cost and `N` is the length of the
|
|
44 |
* longest string. For example, if you are using the default costs (insert = 1,
|
|
45 |
* delete = 1, replace = 1) and computing edit distances for strings of fewer
|
|
46 |
* than 1,000 characters, you might set the alter cost to 0.001.
|
|
47 |
*/
|
|
48 |
final class PhutilEditDistanceMatrix { |
|
49 |
||
50 |
private $insertCost = 1; |
|
51 |
private $deleteCost = 1; |
|
52 |
private $replaceCost = 1; |
|
53 |
private $transposeCost = null; |
|
54 |
private $alterCost = 0; |
|
55 |
private $maximumLength; |
|
56 |
private $computeString; |
|
57 |
||
58 |
private $x; |
|
59 |
private $y; |
|
60 |
private $prefix; |
|
61 |
private $suffix; |
|
62 |
||
63 |
private $distanceMatrix = null; |
|
64 |
private $typeMatrix = null; |
|
65 |
||
66 |
public function setMaximumLength($maximum_length) { |
|
67 |
$this->maximumLength = $maximum_length; |
|
68 |
return $this; |
|
69 |
}
|
|
70 |
||
71 |
public function getMaximumLength() { |
|
72 |
return coalesce($this->maximumLength, $this->getInfinity()); |
|
73 |
}
|
|
74 |
||
75 |
public function setComputeString($compute_string) { |
|
76 |
$this->computeString = $compute_string; |
|
77 |
return $this; |
|
78 |
}
|
|
79 |
||
80 |
public function getComputeString() { |
|
81 |
return $this->computeString; |
|
82 |
}
|
|
83 |
||
84 |
public function setTransposeCost($transpose_cost) { |
|
85 |
$this->transposeCost = $transpose_cost; |
|
86 |
return $this; |
|
87 |
}
|
|
88 |
||
89 |
public function getTransposeCost() { |
|
90 |
return $this->transposeCost; |
|
91 |
}
|
|
92 |
||
93 |
public function setReplaceCost($replace_cost) { |
|
94 |
$this->replaceCost = $replace_cost; |
|
95 |
return $this; |
|
96 |
}
|
|
97 |
||
98 |
public function getReplaceCost() { |
|
99 |
return $this->replaceCost; |
|
100 |
}
|
|
101 |
||
102 |
public function setDeleteCost($delete_cost) { |
|
103 |
$this->deleteCost = $delete_cost; |
|
104 |
return $this; |
|
105 |
}
|
|
106 |
||
107 |
public function getDeleteCost() { |
|
108 |
return $this->deleteCost; |
|
109 |
}
|
|
110 |
||
111 |
public function setInsertCost($insert_cost) { |
|
112 |
$this->insertCost = $insert_cost; |
|
113 |
return $this; |
|
114 |
}
|
|
115 |
||
116 |
public function getInsertCost() { |
|
117 |
return $this->insertCost; |
|
118 |
}
|
|
119 |
||
120 |
public function setAlterCost($alter_cost) { |
|
121 |
$this->alterCost = $alter_cost; |
|
122 |
return $this; |
|
123 |
}
|
|
124 |
||
125 |
public function getAlterCost() { |
|
126 |
return $this->alterCost; |
|
127 |
}
|
|
128 |
||
129 |
public function setSequences(array $x, array $y) { |
|
130 |
||
131 |
// NOTE: We strip common prefixes and suffixes from the inputs because
|
|
132 |
// the runtime of the edit distance algorithm is large and it is common
|
|
133 |
// to diff similar strings.
|
|
134 |
||
135 |
$xl = count($x); |
|
136 |
$yl = count($y); |
|
137 |
$l = min($xl, $yl); |
|
138 |
||
139 |
$prefix_l = 0; |
|
140 |
$suffix_l = 0; |
|
141 |
||
142 |
for ($ii = 0; $ii < $l; $ii++) { |
|
143 |
if ($x[$ii] !== $y[$ii]) { |
|
144 |
break; |
|
145 |
}
|
|
146 |
$prefix_l++; |
|
147 |
}
|
|
148 |
||
149 |
for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) { |
|
150 |
if ($x[$xl - $ii] !== $y[$yl - $ii]) { |
|
151 |
break; |
|
152 |
}
|
|
153 |
$suffix_l++; |
|
154 |
}
|
|
155 |
||
156 |
$this->prefix = array_slice($x, 0, $prefix_l); |
|
157 |
$this->suffix = array_slice($x, $xl - $suffix_l); |
|
158 |
||
159 |
$this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l)); |
|
160 |
$this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l)); |
|
161 |
$this->distanceMatrix = null; |
|
162 |
||
163 |
return $this; |
|
164 |
}
|
|
165 |
||
166 |
private function requireSequences() { |
|
167 |
if ($this->x === null) { |
|
168 |
throw new Exception( |
|
169 |
'Call setSequences() before performing useful work!'); |
|
170 |
}
|
|
171 |
}
|
|
172 |
||
173 |
public function getEditDistance() { |
|
174 |
$this->requireSequences(); |
|
175 |
||
176 |
$x = $this->x; |
|
177 |
$y = $this->y; |
|
178 |
||
179 |
if (!$x) { |
|
180 |
return $this->insertCost * count($y); |
|
181 |
}
|
|
182 |
||
183 |
if (!$y) { |
|
184 |
return $this->deleteCost * count($x); |
|
185 |
}
|
|
186 |
||
187 |
$max = $this->getMaximumLength(); |
|
188 |
if (count($x) > $max || count($y) > $max) { |
|
189 |
return ($this->insertCost * count($y)) + ($this->deleteCost * count($x)); |
|
190 |
}
|
|
191 |
||
192 |
if ($x === $y) { |
|
193 |
return 0; |
|
194 |
}
|
|
195 |
||
196 |
$matrix = $this->getDistanceMatrix(); |
|
197 |
||
198 |
return $matrix[count($x)][count($y)]; |
|
199 |
}
|
|
200 |
||
201 |
/**
|
|
202 |
* Return a string representing the edits between the sequences. The string
|
|
203 |
* has these characters:
|
|
204 |
*
|
|
205 |
* - s (same): Same character in both strings.
|
|
206 |
* - i (insert): Character inserted.
|
|
207 |
* - d (delete): Character deleted.
|
|
208 |
* - x (replace): Character replaced.
|
|
209 |
* - t (transpose): Character transposed.
|
|
210 |
*/
|
|
211 |
public function getEditString() { |
|
212 |
$this->requireSequences(); |
|
213 |
||
214 |
$x = $this->x; |
|
215 |
$y = $this->y; |
|
216 |
||
217 |
if (!$x && !$y) { |
|
218 |
return $this->padEditString(''); |
|
219 |
}
|
|
220 |
||
221 |
if (!$x) { |
|
222 |
return $this->padEditString(str_repeat('i', count($y))); |
|
223 |
}
|
|
224 |
||
225 |
if (!$y) { |
|
226 |
return $this->padEditString(str_repeat('d', count($x))); |
|
227 |
}
|
|
228 |
||
229 |
if ($x === $y) { |
|
230 |
return $this->padEditString(str_repeat('s', count($x))); |
|
231 |
}
|
|
232 |
||
233 |
$max = $this->getMaximumLength(); |
|
234 |
if (count($x) > $max || count($y) > $max) { |
|
235 |
return $this->padEditString( |
|
236 |
str_repeat('d', count($x)). |
|
237 |
str_repeat('i', count($y))); |
|
238 |
}
|
|
239 |
||
240 |
$matrix = $this->getTypeMatrix(); |
|
241 |
||
242 |
$xx = count($x); |
|
243 |
$yy = count($y); |
|
244 |
||
245 |
$transposes = array(); |
|
246 |
$str = ''; |
|
247 |
while (true) { |
|
248 |
$type = $matrix[$xx][$yy]; |
|
249 |
||
250 |
if (is_array($type)) { |
|
251 |
$chr = 't'; |
|
252 |
$transposes[$type[0]][$type[1]] = true; |
|
253 |
$type = $type[2]; |
|
254 |
} else { |
|
255 |
$chr = $type; |
|
256 |
}
|
|
257 |
||
258 |
if (isset($transposes[$xx][$yy])) { |
|
259 |
$chr = 't'; |
|
260 |
}
|
|
261 |
||
262 |
if ($type == 's' || $type == 'x') { |
|
263 |
$xx -= 1; |
|
264 |
$yy -= 1; |
|
265 |
} else if ($type == 'i') { |
|
266 |
$yy -= 1; |
|
267 |
} else if ($type == 'd') { |
|
268 |
$xx -= 1; |
|
269 |
} else { |
|
270 |
throw new Exception("Unknown type '{$type}' in type matrix."); |
|
271 |
}
|
|
272 |
||
273 |
$str .= $chr; |
|
274 |
||
275 |
if ($xx <= 0 && $yy <= 0) { |
|
276 |
break; |
|
277 |
}
|
|
278 |
}
|
|
279 |
||
280 |
return $this->padEditString(strrev($str)); |
|
281 |
}
|
|
282 |
||
283 |
private function padEditString($str) { |
|
284 |
if ($this->prefix) { |
|
285 |
$str = str_repeat('s', count($this->prefix)).$str; |
|
286 |
}
|
|
287 |
if ($this->suffix) { |
|
288 |
$str = $str.str_repeat('s', count($this->suffix)); |
|
289 |
}
|
|
290 |
return $str; |
|
291 |
}
|
|
292 |
||
293 |
private function getTypeMatrix() { |
|
294 |
if (!$this->computeString) { |
|
295 |
throw new Exception( |
|
296 |
'Call setComputeString() before getTypeMatrix().'); |
|
297 |
}
|
|
298 |
if ($this->typeMatrix === null) { |
|
299 |
$this->computeMatrix($this->x, $this->y); |
|
300 |
}
|
|
301 |
return $this->typeMatrix; |
|
302 |
}
|
|
303 |
||
304 |
private function getDistanceMatrix() { |
|
305 |
if ($this->distanceMatrix === null) { |
|
306 |
$this->computeMatrix($this->x, $this->y); |
|
307 |
}
|
|
308 |
return $this->distanceMatrix; |
|
309 |
}
|
|
310 |
||
311 |
private function computeMatrix(array $x, array $y) { |
|
312 |
$xl = count($x); |
|
313 |
$yl = count($y); |
|
314 |
||
315 |
$m = array(); |
|
316 |
||
317 |
$infinity = $this->getInfinity(); |
|
318 |
||
319 |
$use_damerau = ($this->transposeCost !== null); |
|
320 |
if ($use_damerau) { |
|
321 |
// Initialize the default cost of a transpose.
|
|
322 |
$m[-1][-1] = $infinity; |
|
323 |
||
324 |
// Initialize the character position dictionary for Damerau.
|
|
325 |
$d = array(); |
|
326 |
foreach ($x as $c) { |
|
327 |
$d[$c] = -1; |
|
328 |
}
|
|
329 |
foreach ($y as $c) { |
|
330 |
$d[$c] = -1; |
|
331 |
}
|
|
332 |
||
333 |
// Initialize the transpose costs for Damerau.
|
|
334 |
for ($xx = 0; $xx <= $xl; $xx++) { |
|
335 |
$m[$xx][-1] = $infinity; |
|
336 |
}
|
|
337 |
||
338 |
for ($yy = 0; $yy <= $yl; $yy++) { |
|
339 |
$m[-1][$yy] = $infinity; |
|
340 |
}
|
|
341 |
}
|
|
342 |
||
343 |
// Initialize the top row of the matrix.
|
|
344 |
for ($xx = 0; $xx <= $xl; $xx++) { |
|
345 |
$m[$xx][0] = $xx * $this->deleteCost; |
|
346 |
}
|
|
347 |
||
348 |
// Initialize the left column of the matrix.
|
|
349 |
$cost = 0; |
|
350 |
for ($yy = 0; $yy <= $yl; $yy++) { |
|
351 |
$m[0][$yy] = $yy * $this->insertCost; |
|
352 |
}
|
|
353 |
||
354 |
$use_types = ($this->computeString); |
|
355 |
if ($use_types) { |
|
356 |
$t = array(); |
|
357 |
for ($xx = 0; $xx <= $xl; $xx++) { |
|
358 |
$t[$xx][0] = 'd'; |
|
359 |
}
|
|
360 |
for ($yy = 0; $yy <= $yl; $yy++) { |
|
361 |
$t[0][$yy] = 'i'; |
|
362 |
}
|
|
363 |
$t[0][0] = 's'; |
|
364 |
}
|
|
365 |
||
366 |
$alt_cost = $this->getAlterCost(); |
|
367 |
if ($alt_cost && !$use_types) { |
|
368 |
throw new Exception( |
|
369 |
'If you provide an alter cost with setAlterCost(), you must enable '. |
|
370 |
'type computation with setComputeStrings().'); |
|
371 |
}
|
|
372 |
||
373 |
// Build the edit distance matrix.
|
|
374 |
for ($xx = 1; $xx <= $xl; $xx++) { |
|
375 |
$last_dy = -1; |
|
376 |
for ($yy = 1; $yy <= $yl; $yy++) { |
|
377 |
if ($use_damerau) { |
|
378 |
$dx = $d[$y[$yy - 1]]; |
|
379 |
$dy = $last_dy; |
|
380 |
}
|
|
381 |
||
382 |
if ($x[$xx - 1] === $y[$yy - 1]) { |
|
383 |
$rep_cost = $m[$xx - 1][$yy - 1] + 0; |
|
384 |
$rep_type = 's'; |
|
385 |
} else { |
|
386 |
$rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost; |
|
387 |
$rep_type = 'x'; |
|
388 |
}
|
|
389 |
||
390 |
$del_cost = $m[$xx - 1][$yy ] + $this->deleteCost; |
|
391 |
$ins_cost = $m[$xx ][$yy - 1] + $this->insertCost; |
|
392 |
if ($alt_cost) { |
|
393 |
$del_char = $t[$xx - 1][$yy ]; |
|
394 |
$ins_char = $t[$xx ][$yy - 1]; |
|
395 |
$rep_char = $t[$xx - 1][$yy - 1]; |
|
396 |
||
397 |
if ($del_char !== 'd') { |
|
398 |
$del_cost += $alt_cost; |
|
399 |
}
|
|
400 |
if ($ins_char !== 'i') { |
|
401 |
$ins_cost += $alt_cost; |
|
402 |
}
|
|
403 |
if ($rep_char !== $rep_type) { |
|
404 |
$rep_cost += $alt_cost; |
|
405 |
}
|
|
406 |
}
|
|
407 |
||
408 |
if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) { |
|
409 |
$cost = $rep_cost; |
|
410 |
$type = $rep_type; |
|
411 |
if ($rep_type === 's') { |
|
412 |
$last_dy = $yy - 1; |
|
413 |
}
|
|
414 |
} else if ($ins_cost <= $del_cost) { |
|
415 |
$cost = $ins_cost; |
|
416 |
$type = 'i'; |
|
417 |
} else { |
|
418 |
$cost = $del_cost; |
|
419 |
$type = 'd'; |
|
420 |
}
|
|
421 |
||
422 |
if ($use_damerau) { |
|
423 |
$trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1); |
|
424 |
$trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost); |
|
425 |
if ($trn_cost < $cost) { |
|
426 |
$cost = $trn_cost; |
|
427 |
$type = array($dx + 1, $dy + 1, $type); |
|
428 |
}
|
|
429 |
}
|
|
430 |
||
431 |
$m[$xx][$yy] = $cost; |
|
432 |
if ($use_types) { |
|
433 |
$t[$xx][$yy] = $type; |
|
434 |
}
|
|
435 |
}
|
|
436 |
||
437 |
if ($use_damerau) { |
|
438 |
$d[$x[$xx - 1]] = ($xx - 1); |
|
439 |
}
|
|
440 |
}
|
|
441 |
||
442 |
$this->distanceMatrix = $m; |
|
443 |
if ($use_types) { |
|
444 |
$this->typeMatrix = $t; |
|
445 |
}
|
|
446 |
}
|
|
447 |
||
448 |
private function getInfinity() { |
|
449 |
return INF; |
|
450 |
}
|
|
451 |
||
452 |
private function printMatrix(array $m) { |
|
453 |
$x = $this->x; |
|
454 |
$y = $this->y; |
|
455 |
||
456 |
$p = '% 8s '; |
|
457 |
printf($p, ' '); |
|
458 |
foreach (head($m) as $yk => $yv) { |
|
459 |
printf($p, idx($y, $yk - 1, '-')); |
|
460 |
}
|
|
461 |
echo "\n"; |
|
462 |
foreach ($m as $xk => $xv) { |
|
463 |
printf($p, idx($x, $xk - 1, '-')); |
|
464 |
foreach ($xv as $yk => $yv) { |
|
465 |
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv)); |
|
466 |
}
|
|
467 |
echo "\n"; |
|
468 |
}
|
|
469 |
}
|
|
470 |
||
471 |
private function printTypeMatrix(array $t) { |
|
472 |
$x = $this->x; |
|
473 |
$y = $this->y; |
|
474 |
||
475 |
$p = '% 8s '; |
|
476 |
printf($p, ' '); |
|
477 |
foreach (head($t) as $yk => $yv) { |
|
478 |
printf($p, idx($y, $yk - 1, '-')); |
|
479 |
}
|
|
480 |
echo "\n"; |
|
481 |
foreach ($t as $xk => $xv) { |
|
482 |
printf($p, idx($x, $xk - 1, '-')); |
|
483 |
foreach ($xv as $yk => $yv) { |
|
484 |
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv)); |
|
485 |
}
|
|
486 |
echo "\n"; |
|
487 |
}
|
|
488 |
}
|
|
489 |
||
490 |
}
|