4
* Compute edit distance between two scalar sequences. This class uses
5
* Levenshtein (or Damerau-Levenshtein) to compute the edit distance between
6
* two inputs. The inputs are arrays containing any scalars (not just strings)
7
* so it can be used with, e.g., utf8 sequences.
9
* $matrix = id(new PhutilEditDistanceMatrix())
10
* ->setSequences(str_split('ran'), str_split('rat'));
12
* $cost = $matrix->getEditDistance();
14
* Edit distance computation is slow and requires a large amount of memory;
15
* both are roughly O(N * M) in the length of the strings.
17
* You can customize the cost of insertions, deletions and replacements. By
18
* default, each has a cost of 1.
20
* $matrix->setReplaceCost(1);
22
* By default, transpositions are not evaluated (i.e., the algorithm is
23
* Levenshtein). You can cause transpositions to be evaluated by setting a
24
* transpose cost (which will change the algorithm to Damerau-Levenshtein):
26
* $matrix->setTransposeCost(1);
28
* You can also provide a cost to alter the type of operation being applied.
29
* Many strings have several equivalently expensive edit sequences, but one
30
* some sequences are more readable to humans than others. Providing a small
31
* cost to alter operation type tends to smooth out the sequence and produce
32
* long runs of a single operation, which are generally more readable. For
33
* example, these strings:
38
* ...have edit strings "issis" and "isssi", which describe edit operations with
39
* the same cost, but the latter string is more comprehensible to human viewers.
41
* If you set an alter cost, you must call @{method:setComputeString} to enable
42
* type computation. The alter cost should generally be smaller than `c / N`,
43
* where `c` is the smallest operational cost and `N` is the length of the
44
* longest string. For example, if you are using the default costs (insert = 1,
45
* delete = 1, replace = 1) and computing edit distances for strings of fewer
46
* than 1,000 characters, you might set the alter cost to 0.001.
48
final class PhutilEditDistanceMatrix {
50
private $insertCost = 1;
51
private $deleteCost = 1;
52
private $replaceCost = 1;
53
private $transposeCost = null;
54
private $alterCost = 0;
55
private $maximumLength;
56
private $computeString;
63
private $distanceMatrix = null;
64
private $typeMatrix = null;
66
public function setMaximumLength($maximum_length) {
67
$this->maximumLength = $maximum_length;
71
public function getMaximumLength() {
72
return coalesce($this->maximumLength, $this->getInfinity());
75
public function setComputeString($compute_string) {
76
$this->computeString = $compute_string;
80
public function getComputeString() {
81
return $this->computeString;
84
public function setTransposeCost($transpose_cost) {
85
$this->transposeCost = $transpose_cost;
89
public function getTransposeCost() {
90
return $this->transposeCost;
93
public function setReplaceCost($replace_cost) {
94
$this->replaceCost = $replace_cost;
98
public function getReplaceCost() {
99
return $this->replaceCost;
102
public function setDeleteCost($delete_cost) {
103
$this->deleteCost = $delete_cost;
107
public function getDeleteCost() {
108
return $this->deleteCost;
111
public function setInsertCost($insert_cost) {
112
$this->insertCost = $insert_cost;
116
public function getInsertCost() {
117
return $this->insertCost;
120
public function setAlterCost($alter_cost) {
121
$this->alterCost = $alter_cost;
125
public function getAlterCost() {
126
return $this->alterCost;
129
public function setSequences(array $x, array $y) {
131
// NOTE: We strip common prefixes and suffixes from the inputs because
132
// the runtime of the edit distance algorithm is large and it is common
133
// to diff similar strings.
142
for ($ii = 0; $ii < $l; $ii++) {
143
if ($x[$ii] !== $y[$ii]) {
149
for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) {
150
if ($x[$xl - $ii] !== $y[$yl - $ii]) {
156
$this->prefix = array_slice($x, 0, $prefix_l);
157
$this->suffix = array_slice($x, $xl - $suffix_l);
159
$this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l));
160
$this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l));
161
$this->distanceMatrix = null;
166
private function requireSequences() {
167
if ($this->x === null) {
169
'Call setSequences() before performing useful work!');
173
public function getEditDistance() {
174
$this->requireSequences();
180
return $this->insertCost * count($y);
184
return $this->deleteCost * count($x);
187
$max = $this->getMaximumLength();
188
if (count($x) > $max || count($y) > $max) {
189
return ($this->insertCost * count($y)) + ($this->deleteCost * count($x));
196
$matrix = $this->getDistanceMatrix();
198
return $matrix[count($x)][count($y)];
202
* Return a string representing the edits between the sequences. The string
203
* has these characters:
205
* - s (same): Same character in both strings.
206
* - i (insert): Character inserted.
207
* - d (delete): Character deleted.
208
* - x (replace): Character replaced.
209
* - t (transpose): Character transposed.
211
public function getEditString() {
212
$this->requireSequences();
218
return $this->padEditString('');
222
return $this->padEditString(str_repeat('i', count($y)));
226
return $this->padEditString(str_repeat('d', count($x)));
230
return $this->padEditString(str_repeat('s', count($x)));
233
$max = $this->getMaximumLength();
234
if (count($x) > $max || count($y) > $max) {
235
return $this->padEditString(
236
str_repeat('d', count($x)).
237
str_repeat('i', count($y)));
240
$matrix = $this->getTypeMatrix();
245
$transposes = array();
248
$type = $matrix[$xx][$yy];
250
if (is_array($type)) {
252
$transposes[$type[0]][$type[1]] = true;
258
if (isset($transposes[$xx][$yy])) {
262
if ($type == 's' || $type == 'x') {
265
} else if ($type == 'i') {
267
} else if ($type == 'd') {
270
throw new Exception("Unknown type '{$type}' in type matrix.");
275
if ($xx <= 0 && $yy <= 0) {
280
return $this->padEditString(strrev($str));
283
private function padEditString($str) {
285
$str = str_repeat('s', count($this->prefix)).$str;
288
$str = $str.str_repeat('s', count($this->suffix));
293
private function getTypeMatrix() {
294
if (!$this->computeString) {
296
'Call setComputeString() before getTypeMatrix().');
298
if ($this->typeMatrix === null) {
299
$this->computeMatrix($this->x, $this->y);
301
return $this->typeMatrix;
304
private function getDistanceMatrix() {
305
if ($this->distanceMatrix === null) {
306
$this->computeMatrix($this->x, $this->y);
308
return $this->distanceMatrix;
311
private function computeMatrix(array $x, array $y) {
317
$infinity = $this->getInfinity();
319
$use_damerau = ($this->transposeCost !== null);
321
// Initialize the default cost of a transpose.
322
$m[-1][-1] = $infinity;
324
// Initialize the character position dictionary for Damerau.
333
// Initialize the transpose costs for Damerau.
334
for ($xx = 0; $xx <= $xl; $xx++) {
335
$m[$xx][-1] = $infinity;
338
for ($yy = 0; $yy <= $yl; $yy++) {
339
$m[-1][$yy] = $infinity;
343
// Initialize the top row of the matrix.
344
for ($xx = 0; $xx <= $xl; $xx++) {
345
$m[$xx][0] = $xx * $this->deleteCost;
348
// Initialize the left column of the matrix.
350
for ($yy = 0; $yy <= $yl; $yy++) {
351
$m[0][$yy] = $yy * $this->insertCost;
354
$use_types = ($this->computeString);
357
for ($xx = 0; $xx <= $xl; $xx++) {
360
for ($yy = 0; $yy <= $yl; $yy++) {
366
$alt_cost = $this->getAlterCost();
367
if ($alt_cost && !$use_types) {
369
'If you provide an alter cost with setAlterCost(), you must enable '.
370
'type computation with setComputeStrings().');
373
// Build the edit distance matrix.
374
for ($xx = 1; $xx <= $xl; $xx++) {
376
for ($yy = 1; $yy <= $yl; $yy++) {
378
$dx = $d[$y[$yy - 1]];
382
if ($x[$xx - 1] === $y[$yy - 1]) {
383
$rep_cost = $m[$xx - 1][$yy - 1] + 0;
386
$rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost;
390
$del_cost = $m[$xx - 1][$yy ] + $this->deleteCost;
391
$ins_cost = $m[$xx ][$yy - 1] + $this->insertCost;
393
$del_char = $t[$xx - 1][$yy ];
394
$ins_char = $t[$xx ][$yy - 1];
395
$rep_char = $t[$xx - 1][$yy - 1];
397
if ($del_char !== 'd') {
398
$del_cost += $alt_cost;
400
if ($ins_char !== 'i') {
401
$ins_cost += $alt_cost;
403
if ($rep_char !== $rep_type) {
404
$rep_cost += $alt_cost;
408
if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) {
411
if ($rep_type === 's') {
414
} else if ($ins_cost <= $del_cost) {
423
$trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1);
424
$trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost);
425
if ($trn_cost < $cost) {
427
$type = array($dx + 1, $dy + 1, $type);
431
$m[$xx][$yy] = $cost;
433
$t[$xx][$yy] = $type;
438
$d[$x[$xx - 1]] = ($xx - 1);
442
$this->distanceMatrix = $m;
444
$this->typeMatrix = $t;
448
private function getInfinity() {
452
private function printMatrix(array $m) {
458
foreach (head($m) as $yk => $yv) {
459
printf($p, idx($y, $yk - 1, '-'));
462
foreach ($m as $xk => $xv) {
463
printf($p, idx($x, $xk - 1, '-'));
464
foreach ($xv as $yk => $yv) {
465
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
471
private function printTypeMatrix(array $t) {
477
foreach (head($t) as $yk => $yv) {
478
printf($p, idx($y, $yk - 1, '-'));
481
foreach ($t as $xk => $xv) {
482
printf($p, idx($x, $xk - 1, '-'));
483
foreach ($xv as $yk => $yv) {
484
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));