1
/* Analyze differences between two vectors.
3
Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2010 Free Software
6
This program is free software: you can redistribute it and/or modify
7
it under the terms of the GNU General Public License as published by
8
the Free Software Foundation; either version 3 of the License, or
9
(at your option) any later version.
11
This program is distributed in the hope that it will be useful,
12
but WITHOUT ANY WARRANTY; without even the implied warranty of
13
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
GNU General Public License for more details.
16
You should have received a copy of the GNU General Public License
17
along with this program. If not, see <http://www.gnu.org/licenses/>. */
20
/* The basic idea is to consider two vectors as similar if, when
21
transforming the first vector into the second vector through a
22
sequence of edits (inserts and deletes of one element each),
23
this sequence is short - or equivalently, if the ordered list
24
of elements that are untouched by these edits is long. For a
25
good introduction to the subject, read about the "Levenshtein
26
distance" in Wikipedia.
28
The basic algorithm is described in:
29
"An O(ND) Difference Algorithm and its Variations", Eugene Myers,
30
Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
31
see especially section 4.2, which describes the variation used below.
33
The basic algorithm was independently discovered as described in:
34
"Algorithms for Approximate String Matching", E. Ukkonen,
35
Information and Control Vol. 64, 1985, pp. 100-118.
37
Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
38
heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
39
at the price of producing suboptimal output for large inputs with many differences. */
42
/* Before including this file, you need to define:
43
ELEMENT The element type of the vectors being compared.
44
EQUAL A two-argument macro that tests two elements for equality.
46
OFFSET A signed integer type sufficient to hold the
47
difference between two indices. Usually
48
something like ssize_t.
49
EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
50
NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
51
NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
52
EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
53
early abort of the computation.
54
USE_HEURISTIC (Optional) Define if you want to support the
55
heuristic for large vectors.
56
It is also possible to use this file with abstract arrays. In this case,
57
xvec and yvec are not represented in memory. They only exist conceptually.
58
In this case, the list of defines above is amended as follows:
61
XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
62
A three-argument macro: References xvec[xoff] and
63
yvec[yoff] and tests these elements for equality.
64
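Before including this file, you also need to include:
#include <limits.h> (for CHAR_BIT)
#include <stdbool.h> (for bool, true and false)
a header that defines the MIN and MAX macros (e.g. gnulib's "minmax.h")
*/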
70
/* Maximum value of type OFFSET. */
72
((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
74
/* Default to no early abort. */
76
# define EARLY_ABORT(ctxt) false
79
/* Use this to suppress gcc's `...may be used before initialized' warnings.
80
Beware: The Code argument must not contain commas. */
83
# define IF_LINT(Code) Code
85
# define IF_LINT(Code) /* empty */
89
/* As above, but when Code must contain one comma. */
92
# define IF_LINT2(Code1, Code2) Code1, Code2
94
# define IF_LINT2(Code1, Code2) /* empty */
99
* Context of comparison operation.
104
/* Vectors being compared. */
112
/* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
113
furthest along the given diagonal in the forward search of the edit matrix. */
117
/* Vector, indexed by diagonal, containing the X coordinate of the point
118
furthest along the given diagonal in the backward search of the edit matrix. */
123
/* This corresponds to the diff -H flag. With this heuristic, for
124
vectors with a constant small density of changes, the algorithm is
125
linear in the vector size. */
129
/* Edit scripts longer than this are too expensive to compute. */
130
OFFSET too_expensive;
132
/* Snakes bigger than this are considered `big'. */
133
#define SNAKE_LIMIT 20
138
/* Midpoints of this partition. */
142
/* True if low half will be analyzed minimally. */
145
/* Likewise for high half. */
150
/* Find the midpoint of the shortest edit script for a specified portion of the two vectors.
153
Scan from the beginnings of the vectors, and simultaneously from the ends,
154
doing a breadth-first search through the space of edit-sequence.
155
When the two searches meet, we have found the midpoint of the shortest edit sequence.
158
If FIND_MINIMAL is true, find the minimal edit script regardless of
159
expense. Otherwise, if the search is too expensive, use heuristics to
160
stop the search and report a suboptimal answer.
162
Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
163
XMID - YMID equals (XOFF - YOFF) plus the number of deleted elements minus
164
the number of inserted elements (counting only elements before the midpoint).
166
Set PART->lo_minimal to true iff the minimal edit script for the
167
left half of the partition is known; similarly for PART->hi_minimal.
169
This function assumes that the first elements of the specified portions
170
of the two vectors do not match, and likewise that the last elements do not
171
match. The caller must trim matching elements from the beginning and end
172
of the portions it is going to specify.
174
If we return the "wrong" partitions, the worst this can do is cause
175
suboptimal diff output. It cannot cause incorrect diff output. */
178
diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
179
struct partition *part, struct context *ctxt)
181
OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
182
OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
184
ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
185
ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
186
#define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
188
#define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
190
const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
191
const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
192
const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
193
const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
195
OFFSET fmax = fmid; /* Limits of top-down search. */
197
OFFSET bmax = bmid; /* Limits of bottom-up search. */
198
OFFSET c; /* Cost. */
199
bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
200
diagonal with respect to the northwest. */
207
OFFSET d; /* Active diagonal. */
208
bool big_snake = false;
210
/* Extend the top-down search by an edit step in each diagonal. */
219
for (d = fmax; d >= fmin; d -= 2)
223
OFFSET tlo = fd[d - 1];
224
OFFSET thi = fd[d + 1];
225
OFFSET x0 = tlo < thi ? thi : tlo + 1;
227
for (x = x0, y = x0 - d;
228
x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
231
if (x - x0 > SNAKE_LIMIT)
234
if (odd && bmin <= d && d <= bmax && bd[d] <= x)
238
part->lo_minimal = part->hi_minimal = true;
243
/* Similarly extend the bottom-up search. */
245
bd[--bmin - 1] = OFFSET_MAX;
249
bd[++bmax + 1] = OFFSET_MAX;
252
for (d = bmax; d >= bmin; d -= 2)
256
OFFSET tlo = bd[d - 1];
257
OFFSET thi = bd[d + 1];
258
OFFSET x0 = tlo < thi ? tlo : thi - 1;
260
for (x = x0, y = x0 - d;
261
xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
264
if (x0 - x > SNAKE_LIMIT)
267
if (!odd && fmin <= d && d <= fmax && x <= fd[d])
271
part->lo_minimal = part->hi_minimal = true;
280
/* Heuristic: check occasionally for a diagonal that has made lots
281
of progress compared with the edit distance. If we have any
282
such, find the one that has made the most progress and return it
283
as if it had succeeded.
285
With this heuristic, for vectors with a constant small density
286
of changes, the algorithm is linear in the vector size. */
288
if (200 < c && big_snake && ctxt->heuristic)
293
for (d = fmax; d >= fmin; d -= 2)
295
OFFSET dd = d - fmid;
298
OFFSET v = (x - xoff) * 2 - dd;
300
if (v > 12 * (c + (dd < 0 ? -dd : dd)))
303
&& xoff + SNAKE_LIMIT <= x && x < xlim
304
&& yoff + SNAKE_LIMIT <= y && y < ylim)
306
/* We have a good enough best diagonal; now insist
307
that it end with a significant snake. */
310
for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
311
if (k == SNAKE_LIMIT)
323
part->lo_minimal = true;
324
part->hi_minimal = false;
332
for (d = bmax; d >= bmin; d -= 2)
334
OFFSET dd = d - bmid;
337
OFFSET v = (xlim - x) * 2 + dd;
339
if (v > 12 * (c + (dd < 0 ? -dd : dd)))
342
&& xoff < x && x <= xlim - SNAKE_LIMIT
343
&& yoff < y && y <= ylim - SNAKE_LIMIT)
345
/* We have a good enough best diagonal; now insist
346
that it end with a significant snake. */
349
for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
350
if (k == SNAKE_LIMIT - 1)
362
part->lo_minimal = false;
363
part->hi_minimal = true;
368
#endif /* USE_HEURISTIC */
370
/* Heuristic: if we've gone well beyond the call of duty, give up
371
and report halfway between our best results so far. */
372
if (c >= ctxt->too_expensive)
375
OFFSET fxbest IF_LINT (= 0);
377
OFFSET bxbest IF_LINT (= 0);
379
/* Find forward diagonal that maximizes X + Y. */
381
for (d = fmax; d >= fmin; d -= 2)
383
OFFSET x = MIN (fd[d], xlim);
397
/* Find backward diagonal that minimizes X + Y. */
398
bxybest = OFFSET_MAX;
399
for (d = bmax; d >= bmin; d -= 2)
401
OFFSET x = MAX (xoff, bd[d]);
415
/* Use the better of the two diagonals. */
416
if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
419
part->ymid = fxybest - fxbest;
420
part->lo_minimal = true;
421
part->hi_minimal = false;
426
part->ymid = bxybest - bxbest;
427
part->lo_minimal = false;
428
part->hi_minimal = true;
433
#undef XREF_YREF_EQUAL
437
/* Compare in detail contiguous subsequences of the two vectors
438
which are known, as a whole, to match each other.
440
The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
442
Note that XLIM, YLIM are exclusive bounds. All indices into the vectors are origin-0.
445
If FIND_MINIMAL, find a minimal difference no matter how expensive it is.
448
The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
450
Return false if terminated normally, or true if terminated through early abort. */
454
compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
455
bool find_minimal, struct context *ctxt)
458
ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
459
ELEMENT const *yv = ctxt->yvec;
460
#define XREF_YREF_EQUAL(x,y) EQUAL (xv[x], yv[y])
462
#define XREF_YREF_EQUAL(x,y) XVECREF_YVECREF_EQUAL (ctxt, x, y)
465
/* Slide down the bottom initial diagonal. */
466
while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
472
/* Slide up the top initial diagonal. */
473
while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
479
/* Handle simple cases. */
483
NOTE_INSERT (ctxt, yoff);
484
if (EARLY_ABORT (ctxt))
488
else if (yoff == ylim)
491
NOTE_DELETE (ctxt, xoff);
492
if (EARLY_ABORT (ctxt))
498
struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 });
500
/* Find a point of correspondence in the middle of the vectors. */
501
diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
503
/* Use the partitions to split this problem into subproblems. */
504
if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
506
if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
511
#undef XREF_YREF_EQUAL
517
#undef EXTRA_CONTEXT_FIELDS
522
#undef XVECREF_YVECREF_EQUAL