~ubuntu-branches/ubuntu/vivid/phabricator/vivid-proposed

2 by Richard Sellam
Initial release (closes: #703046)
1
<?php
2
3
/**
4
 * Compute edit distance between two scalar sequences. This class uses
5
 * Levenshtein (or Damerau-Levenshtein) to compute the edit distance between
6
 * two inputs. The inputs are arrays containing any scalars (not just strings)
7
 * so it can be used with, e.g., utf8 sequences.
8
 *
9
 *  $matrix = id(new PhutilEditDistanceMatrix())
10
 *    ->setSequences(str_split('ran'), str_split('rat'));
11
 *
12
 *  $cost = $matrix->getEditDistance();
13
 *
14
 * Edit distance computation is slow and requires a large amount of memory;
15
 * both are roughly O(N * M) in the length of the strings.
16
 *
17
 * You can customize the cost of insertions, deletions and replacements. By
18
 * default, each has a cost of 1.
19
 *
20
 *   $matrix->setReplaceCost(1);
21
 *
22
 * By default, transpositions are not evaluated (i.e., the algorithm is
23
 * Levenshtein). You can cause transpositions to be evaluated by setting a
24
 * transpose cost (which will change the algorithm to Damerau-Levenshtein):
25
 *
26
 *   $matrix->setTransposeCost(1);
27
 *
28
 * You can also provide a cost to alter the type of operation being applied.
29
 * Many strings have several equivalently expensive edit sequences, but one
30
 * some sequences are more readable to humans than others. Providing a small
31
 * cost to alter operation type tends to smooth out the sequence and produce
32
 * long runs of a single operation, which are generally more readable. For
33
 * example, these strings:
34
 *
35
 *   (x)
36
 *   ((x))
37
 *
38
 * ...have edit strings "issis" and "isssi", which describe edit operations with
39
 * the same cost, but the latter string is more comprehensible to human viewers.
40
 *
41
 * If you set an alter cost, you must call @{method:setComputeString} to enable
42
 * type computation. The alter cost should generally be smaller than `c / N`,
43
 * where `c` is the smallest operational cost and `N` is the length of the
44
 * longest string. For example, if you are using the default costs (insert = 1,
45
 * delete = 1, replace = 1) and computing edit distances for strings of fewer
46
 * than 1,000 characters, you might set the alter cost to 0.001.
47
 */
48
final class PhutilEditDistanceMatrix {
49
50
  private $insertCost    = 1;
51
  private $deleteCost    = 1;
52
  private $replaceCost   = 1;
53
  private $transposeCost = null;
54
  private $alterCost     = 0;
55
  private $maximumLength;
56
  private $computeString;
57
58
  private $x;
59
  private $y;
60
  private $prefix;
61
  private $suffix;
62
63
  private $distanceMatrix = null;
64
  private $typeMatrix = null;
65
66
  public function setMaximumLength($maximum_length) {
67
    $this->maximumLength = $maximum_length;
68
    return $this;
69
  }
70
71
  public function getMaximumLength() {
72
    return coalesce($this->maximumLength, $this->getInfinity());
73
  }
74
75
  public function setComputeString($compute_string) {
76
    $this->computeString = $compute_string;
77
    return $this;
78
  }
79
80
  public function getComputeString() {
81
    return $this->computeString;
82
  }
83
84
  public function setTransposeCost($transpose_cost) {
85
    $this->transposeCost = $transpose_cost;
86
    return $this;
87
  }
88
89
  public function getTransposeCost() {
90
    return $this->transposeCost;
91
  }
92
93
  public function setReplaceCost($replace_cost) {
94
    $this->replaceCost = $replace_cost;
95
    return $this;
96
  }
97
98
  public function getReplaceCost() {
99
    return $this->replaceCost;
100
  }
101
102
  public function setDeleteCost($delete_cost) {
103
    $this->deleteCost = $delete_cost;
104
    return $this;
105
  }
106
107
  public function getDeleteCost() {
108
    return $this->deleteCost;
109
  }
110
111
  public function setInsertCost($insert_cost) {
112
    $this->insertCost = $insert_cost;
113
    return $this;
114
  }
115
116
  public function getInsertCost() {
117
    return $this->insertCost;
118
  }
119
120
  public function setAlterCost($alter_cost) {
121
    $this->alterCost = $alter_cost;
122
    return $this;
123
  }
124
125
  public function getAlterCost() {
126
    return $this->alterCost;
127
  }
128
129
  public function setSequences(array $x, array $y) {
130
131
    // NOTE: We strip common prefixes and suffixes from the inputs because
132
    // the runtime of the edit distance algorithm is large and it is common
133
    // to diff similar strings.
134
135
    $xl = count($x);
136
    $yl = count($y);
137
    $l = min($xl, $yl);
138
139
    $prefix_l = 0;
140
    $suffix_l = 0;
141
142
    for ($ii = 0; $ii < $l; $ii++) {
143
      if ($x[$ii] !== $y[$ii]) {
144
        break;
145
      }
146
      $prefix_l++;
147
    }
148
149
    for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) {
150
      if ($x[$xl - $ii] !== $y[$yl - $ii]) {
151
        break;
152
      }
153
      $suffix_l++;
154
    }
155
156
    $this->prefix = array_slice($x, 0, $prefix_l);
157
    $this->suffix = array_slice($x, $xl - $suffix_l);
158
159
    $this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l));
160
    $this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l));
161
    $this->distanceMatrix = null;
162
163
    return $this;
164
  }
165
166
  private function requireSequences() {
167
    if ($this->x === null) {
168
      throw new Exception(
169
        'Call setSequences() before performing useful work!');
170
    }
171
  }
172
173
  public function getEditDistance() {
174
    $this->requireSequences();
175
176
    $x = $this->x;
177
    $y = $this->y;
178
179
    if (!$x) {
180
      return $this->insertCost * count($y);
181
    }
182
183
    if (!$y) {
184
      return $this->deleteCost * count($x);
185
    }
186
187
    $max = $this->getMaximumLength();
188
    if (count($x) > $max || count($y) > $max) {
189
      return ($this->insertCost * count($y)) + ($this->deleteCost * count($x));
190
    }
191
192
    if ($x === $y) {
193
      return 0;
194
    }
195
196
    $matrix = $this->getDistanceMatrix();
197
198
    return $matrix[count($x)][count($y)];
199
  }
200
201
  /**
202
   * Return a string representing the edits between the sequences. The string
203
   * has these characters:
204
   *
205
   *   - s (same): Same character in both strings.
206
   *   - i (insert): Character inserted.
207
   *   - d (delete): Character deleted.
208
   *   - x (replace): Character replaced.
209
   *   - t (transpose): Character transposed.
210
   */
211
  public function getEditString() {
212
    $this->requireSequences();
213
214
    $x = $this->x;
215
    $y = $this->y;
216
217
    if (!$x && !$y) {
218
      return $this->padEditString('');
219
    }
220
221
    if (!$x) {
222
      return $this->padEditString(str_repeat('i', count($y)));
223
    }
224
225
    if (!$y) {
226
      return $this->padEditString(str_repeat('d', count($x)));
227
    }
228
229
    if ($x === $y) {
230
      return $this->padEditString(str_repeat('s', count($x)));
231
    }
232
233
    $max = $this->getMaximumLength();
234
    if (count($x) > $max || count($y) > $max) {
235
      return $this->padEditString(
236
        str_repeat('d', count($x)).
237
        str_repeat('i', count($y)));
238
    }
239
240
    $matrix = $this->getTypeMatrix();
241
242
    $xx = count($x);
243
    $yy = count($y);
244
245
    $transposes = array();
246
    $str = '';
247
    while (true) {
248
      $type = $matrix[$xx][$yy];
249
250
      if (is_array($type)) {
251
        $chr = 't';
252
        $transposes[$type[0]][$type[1]] = true;
253
        $type = $type[2];
254
      } else {
255
        $chr = $type;
256
      }
257
258
      if (isset($transposes[$xx][$yy])) {
259
        $chr = 't';
260
      }
261
262
      if ($type == 's' || $type == 'x') {
263
        $xx -= 1;
264
        $yy -= 1;
265
      } else if ($type == 'i') {
266
        $yy -= 1;
267
      } else if ($type == 'd') {
268
        $xx -= 1;
269
      } else {
270
        throw new Exception("Unknown type '{$type}' in type matrix.");
271
      }
272
273
      $str .= $chr;
274
275
      if ($xx <= 0 && $yy <= 0) {
276
        break;
277
      }
278
    }
279
280
    return $this->padEditString(strrev($str));
281
  }
282
283
  private function padEditString($str) {
284
    if ($this->prefix) {
285
      $str = str_repeat('s', count($this->prefix)).$str;
286
    }
287
    if ($this->suffix) {
288
      $str = $str.str_repeat('s', count($this->suffix));
289
    }
290
    return $str;
291
  }
292
293
  private function getTypeMatrix() {
294
    if (!$this->computeString) {
295
      throw new Exception(
296
        'Call setComputeString() before getTypeMatrix().');
297
    }
298
    if ($this->typeMatrix === null) {
299
      $this->computeMatrix($this->x, $this->y);
300
    }
301
    return $this->typeMatrix;
302
  }
303
304
  private function getDistanceMatrix() {
305
    if ($this->distanceMatrix === null) {
306
      $this->computeMatrix($this->x, $this->y);
307
    }
308
    return $this->distanceMatrix;
309
  }
310
311
  private function computeMatrix(array $x, array $y) {
312
    $xl = count($x);
313
    $yl = count($y);
314
315
    $m = array();
316
317
    $infinity = $this->getInfinity();
318
319
    $use_damerau = ($this->transposeCost !== null);
320
    if ($use_damerau) {
321
      // Initialize the default cost of a transpose.
322
      $m[-1][-1] = $infinity;
323
324
      // Initialize the character position dictionary for Damerau.
325
      $d = array();
326
      foreach ($x as $c) {
327
        $d[$c] = -1;
328
      }
329
      foreach ($y as $c) {
330
        $d[$c] = -1;
331
      }
332
333
      // Initialize the transpose costs for Damerau.
334
      for ($xx = 0; $xx <= $xl; $xx++) {
335
        $m[$xx][-1] = $infinity;
336
      }
337
338
      for ($yy = 0; $yy <= $yl; $yy++) {
339
        $m[-1][$yy] = $infinity;
340
      }
341
    }
342
343
    // Initialize the top row of the matrix.
344
    for ($xx = 0; $xx <= $xl; $xx++) {
345
      $m[$xx][0] = $xx * $this->deleteCost;
346
    }
347
348
    // Initialize the left column of the matrix.
349
    $cost = 0;
350
    for ($yy = 0; $yy <= $yl; $yy++) {
351
      $m[0][$yy] = $yy * $this->insertCost;
352
    }
353
354
    $use_types = ($this->computeString);
355
    if ($use_types) {
356
      $t = array();
357
      for ($xx = 0; $xx <= $xl; $xx++) {
358
        $t[$xx][0] = 'd';
359
      }
360
      for ($yy = 0; $yy <= $yl; $yy++) {
361
        $t[0][$yy] = 'i';
362
      }
363
      $t[0][0] = 's';
364
    }
365
366
    $alt_cost = $this->getAlterCost();
367
    if ($alt_cost && !$use_types) {
368
      throw new Exception(
369
        'If you provide an alter cost with setAlterCost(), you must enable '.
370
        'type computation with setComputeStrings().');
371
    }
372
373
    // Build the edit distance matrix.
374
    for ($xx = 1; $xx <= $xl; $xx++) {
375
      $last_dy = -1;
376
      for ($yy = 1; $yy <= $yl; $yy++) {
377
        if ($use_damerau) {
378
          $dx = $d[$y[$yy - 1]];
379
          $dy = $last_dy;
380
        }
381
382
        if ($x[$xx - 1] === $y[$yy - 1]) {
383
          $rep_cost = $m[$xx - 1][$yy - 1] + 0;
384
          $rep_type = 's';
385
        } else {
386
          $rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost;
387
          $rep_type = 'x';
388
        }
389
390
        $del_cost = $m[$xx - 1][$yy    ] + $this->deleteCost;
391
        $ins_cost = $m[$xx    ][$yy - 1] + $this->insertCost;
392
        if ($alt_cost) {
393
          $del_char = $t[$xx - 1][$yy    ];
394
          $ins_char = $t[$xx    ][$yy - 1];
395
          $rep_char = $t[$xx - 1][$yy - 1];
396
397
          if ($del_char !== 'd') {
398
            $del_cost += $alt_cost;
399
          }
400
          if ($ins_char !== 'i') {
401
            $ins_cost += $alt_cost;
402
          }
403
          if ($rep_char !== $rep_type) {
404
            $rep_cost += $alt_cost;
405
          }
406
        }
407
408
        if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) {
409
          $cost = $rep_cost;
410
          $type = $rep_type;
411
          if ($rep_type === 's') {
412
            $last_dy = $yy - 1;
413
          }
414
        } else if ($ins_cost <= $del_cost) {
415
          $cost = $ins_cost;
416
          $type = 'i';
417
        } else {
418
          $cost = $del_cost;
419
          $type = 'd';
420
        }
421
422
        if ($use_damerau) {
423
          $trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1);
424
          $trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost);
425
          if ($trn_cost < $cost) {
426
            $cost = $trn_cost;
427
            $type = array($dx + 1, $dy + 1, $type);
428
          }
429
        }
430
431
        $m[$xx][$yy] = $cost;
432
        if ($use_types) {
433
          $t[$xx][$yy] = $type;
434
        }
435
      }
436
437
      if ($use_damerau) {
438
        $d[$x[$xx - 1]] = ($xx - 1);
439
      }
440
    }
441
442
    $this->distanceMatrix = $m;
443
    if ($use_types) {
444
      $this->typeMatrix = $t;
445
    }
446
  }
447
448
  private function getInfinity() {
449
    return INF;
450
  }
451
452
  private function printMatrix(array $m) {
453
    $x = $this->x;
454
    $y = $this->y;
455
456
    $p = '% 8s ';
457
    printf($p, ' ');
458
    foreach (head($m) as $yk => $yv) {
459
      printf($p, idx($y, $yk - 1, '-'));
460
    }
461
    echo "\n";
462
    foreach ($m as $xk => $xv) {
463
      printf($p, idx($x, $xk - 1, '-'));
464
      foreach ($xv as $yk => $yv) {
465
        printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
466
      }
467
      echo "\n";
468
    }
469
  }
470
471
  private function printTypeMatrix(array $t) {
472
    $x = $this->x;
473
    $y = $this->y;
474
475
    $p = '% 8s ';
476
    printf($p, ' ');
477
    foreach (head($t) as $yk => $yv) {
478
      printf($p, idx($y, $yk - 1, '-'));
479
    }
480
    echo "\n";
481
    foreach ($t as $xk => $xv) {
482
      printf($p, idx($x, $xk - 1, '-'));
483
      foreach ($xv as $yk => $yv) {
484
        printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
485
      }
486
      echo "\n";
487
    }
488
  }
489
490
}