1
/* Functions to make fuzzy comparisons between strings
2
Copyright (C) 1988-1989, 1992-1993, 1995, 2001 Free Software Foundation, Inc.
4
This program is free software; you can redistribute it and/or modify
5
it under the terms of the GNU General Public License as published by
6
the Free Software Foundation; either version 2 of the License, or (at
7
your option) any later version.
9
This program is distributed in the hope that it will be useful, but
10
WITHOUT ANY WARRANTY; without even the implied warranty of
11
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
General Public License for more details.
14
You should have received a copy of the GNU General Public License
15
along with this program; if not, write to the Free Software
16
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19
Derived from GNU diff 2.7, analyze.c et al.
21
The basic algorithm is described in:
22
"An O(ND) Difference Algorithm and its Variations", Eugene Myers,
23
Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
24
see especially section 4.2, which describes the variation used below.
26
The basic algorithm was independently discovered as described in:
27
"Algorithms for Approximate String Matching", E. Ukkonen,
28
Information and Control Vol. 64, 1985, pp. 100-118.
30
Modified to work on strings rather than files
31
by Peter Miller <pmiller@agso.gov.au>, October 1995 */
43
# define INT_MAX ((int)(~(unsigned)0 >> 1))
51
* Data on one input string being compared.
55
/* The string to be compared. */
58
/* The length of the string to be compared. */
61
/* The number of characters inserted or deleted. */
65
static struct string_data string[2];
70
/* This corresponds to the diff -H flag. With this heuristic, for
71
strings with a constant small density of changes, the algorithm is
72
linear in the strings size. This is unlikely in typical uses of
73
fstrcmp, and so is usually compiled out. Besides, there is no
74
interface to set it true. */
80
/* Vector, indexed by diagonal, containing 1 + the X coordinate of the
81
point furthest along the given diagonal in the forward search of the
85
/* Vector, indexed by diagonal, containing the X coordinate of the point
86
furthest along the given diagonal in the backward search of the edit
90
/* Edit scripts longer than this are too expensive to compute. */
91
static int too_expensive;
93
/* Snakes bigger than this are considered `big'. */
94
#define SNAKE_LIMIT 20
98
/* Midpoints of this partition. */
101
/* Nonzero if low half will be analyzed minimally. */
104
/* Likewise for high half. */
110
diag - find diagonal path
113
int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
114
struct partition *part);
117
Find the midpoint of the shortest edit script for a specified
118
portion of the two strings.
120
Scan from the beginnings of the strings, and simultaneously from
121
the ends, doing a breadth-first search through the space of
122
edit-sequence. When the two searches meet, we have found the
123
midpoint of the shortest edit sequence.
125
If MINIMAL is nonzero, find the minimal edit script regardless
126
of expense. Otherwise, if the search is too expensive, use
127
heuristics to stop the search and report a suboptimal answer.
130
Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal
131
number XMID - YMID equals the number of inserted characters
132
minus the number of deleted characters (counting only characters
133
before the midpoint). Return the approximate edit cost; this is
134
the total number of characters inserted or deleted (counting
135
only characters before the midpoint), unless a heuristic is used
136
to terminate the search prematurely.
138
Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
139
for the left half of the partition is known; similarly for
143
This function assumes that the first characters of the specified
144
portions of the two strings do not match, and likewise that the
145
last characters do not match. The caller must trim matching
146
characters from the beginning and end of the portions it is
149
If we return the "wrong" partitions, the worst this can do is
150
cause suboptimal diff output. It cannot cause incorrect diff
153
static int diag PARAMS ((int, int, int, int, int, struct partition *));
156
diag (xoff, xlim, yoff, ylim, minimal, part)
162
struct partition *part;
164
int *const fd = fdiag; /* Give the compiler a chance. */
165
int *const bd = bdiag; /* Additional help for the compiler. */
166
const char *const xv = string[0].data; /* Still more help for the compiler. */
167
const char *const yv = string[1].data; /* And more and more . . . */
168
const int dmin = xoff - ylim; /* Minimum valid diagonal. */
169
const int dmax = xlim - yoff; /* Maximum valid diagonal. */
170
const int fmid = xoff - yoff; /* Center diagonal of top-down search. */
171
const int bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
173
int fmax = fmid; /* Limits of top-down search. */
175
int bmax = bmid; /* Limits of bottom-up search. */
177
int odd = (fmid - bmid) & 1;
180
* True if southeast corner is on an odd diagonal with respect
187
int d; /* Active diagonal. */
191
/* Extend the top-down search by an edit step in each diagonal. */
200
for (d = fmax; d >= fmin; d -= 2)
217
while (x < xlim && y < ylim && xv[x] == yv[y])
222
if (x - oldx > SNAKE_LIMIT)
225
if (odd && bmin <= d && d <= bmax && bd[d] <= x)
229
part->lo_minimal = part->hi_minimal = 1;
233
/* Similarly extend the bottom-up search. */
235
bd[--bmin - 1] = INT_MAX;
239
bd[++bmax + 1] = INT_MAX;
242
for (d = bmax; d >= bmin; d -= 2)
258
while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
263
if (oldx - x > SNAKE_LIMIT)
266
if (!odd && fmin <= d && d <= fmax && x <= fd[d])
270
part->lo_minimal = part->hi_minimal = 1;
279
/* Heuristic: check occasionally for a diagonal that has made lots
280
of progress compared with the edit distance. If we have any
281
such, find the one that has made the most progress and return
282
it as if it had succeeded.
284
With this heuristic, for strings with a constant small density
285
of changes, the algorithm is linear in the strings size. */
286
if (c > 200 && big_snake && heuristic)
291
for (d = fmax; d >= fmin; d -= 2)
301
v = (x - xoff) * 2 - dd;
303
if (v > 12 * (c + (dd < 0 ? -dd : dd)))
309
xoff + SNAKE_LIMIT <= x
313
yoff + SNAKE_LIMIT <= y
318
/* We have a good enough best diagonal; now insist
319
that it end with a significant snake. */
322
for (k = 1; xv[x - k] == yv[y - k]; k++)
324
if (k == SNAKE_LIMIT)
337
part->lo_minimal = 1;
338
part->hi_minimal = 0;
342
for (d = bmax; d >= bmin; d -= 2)
352
v = (xlim - x) * 2 + dd;
354
if (v > 12 * (c + (dd < 0 ? -dd : dd)))
356
if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
357
yoff < y && y <= ylim - SNAKE_LIMIT)
359
/* We have a good enough best diagonal; now insist
360
that it end with a significant snake. */
363
for (k = 0; xv[x + k] == yv[y + k]; k++)
365
if (k == SNAKE_LIMIT - 1)
378
part->lo_minimal = 0;
379
part->hi_minimal = 1;
383
#endif /* MINUS_H_FLAG */
385
/* Heuristic: if we've gone well beyond the call of duty, give up
386
and report halfway between our best results so far. */
387
if (c >= too_expensive)
394
/* Pacify `gcc -Wall'. */
398
/* Find forward diagonal that maximizes X + Y. */
400
for (d = fmax; d >= fmin; d -= 2)
405
x = fd[d] < xlim ? fd[d] : xlim;
419
/* Find backward diagonal that minimizes X + Y. */
421
for (d = bmax; d >= bmin; d -= 2)
426
x = xoff > bd[d] ? xoff : bd[d];
440
/* Use the better of the two diagonals. */
441
if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
444
part->ymid = fxybest - fxbest;
445
part->lo_minimal = 1;
446
part->hi_minimal = 0;
451
part->ymid = bxybest - bxbest;
452
part->lo_minimal = 0;
453
part->hi_minimal = 1;
462
compareseq - find edit sequence
465
void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal);
468
Compare in detail contiguous subsequences of the two strings
469
which are known, as a whole, to match each other.
471
The subsequence of string 0 is [XOFF, XLIM) and likewise for
474
Note that XLIM, YLIM are exclusive bounds. All character
475
numbers are origin-0.
477
If MINIMAL is nonzero, find a minimal difference no matter how
480
static void compareseq PARAMS ((int, int, int, int, int));
483
compareseq (xoff, xlim, yoff, ylim, minimal)
490
const char *const xv = string[0].data; /* Help the compiler. */
491
const char *const yv = string[1].data;
493
/* Slide down the bottom initial diagonal. */
494
while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
500
/* Slide up the top initial diagonal. */
501
while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
507
/* Handle simple cases. */
512
++string[1].edit_count;
516
else if (yoff == ylim)
520
++string[0].edit_count;
527
struct partition part;
529
/* Find a point of correspondence in the middle of the strings. */
530
c = diag (xoff, xlim, yoff, ylim, minimal, &part);
534
/* This should be impossible, because it implies that one of
535
the two subsequences is empty, and that case was handled
536
above without calling `diag'. Let's verify that this is
540
/* The two subsequences differ by a single insert or delete;
541
record it and we are done. */
542
if (part.xmid - part.ymid < xoff - yoff)
543
++string[1].edit_count;
545
++string[0].edit_count;
550
/* Use the partitions to split this problem into subproblems. */
551
compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
552
compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
559
fstrcmp - fuzzy string compare
562
double fstrcmp(const char *, const char *);
565
The fstrcmp function may be used to compare two string for
566
similarity. It is very useful in reducing "cascade" or
567
"secondary" errors in compilers or other situations where
571
double; 0 if the strings are entirly dissimilar, 1 if the
572
strings are identical, and a number in between if they are
576
fstrcmp (string1, string2)
583
static int *fdiag_buf;
584
static size_t fdiag_max;
586
/* set the info for each string. */
587
string[0].data = string1;
588
string[0].data_length = strlen (string1);
589
string[1].data = string2;
590
string[1].data_length = strlen (string2);
592
/* short-circuit obvious comparisons */
593
if (string[0].data_length == 0 && string[1].data_length == 0)
595
if (string[0].data_length == 0 || string[1].data_length == 0)
598
/* Set TOO_EXPENSIVE to be approximate square root of input size,
599
bounded below by 256. */
601
for (i = string[0].data_length + string[1].data_length; i != 0; i >>= 2)
603
if (too_expensive < 256)
606
/* Because fstrcmp is typically called multiple times, while scanning
607
symbol tables, etc, attempt to minimize the number of memory
608
allocations performed. Thus, we use a static buffer for the
609
diagonal vectors, and never free them. */
610
fdiag_len = string[0].data_length + string[1].data_length + 3;
611
if (fdiag_len > fdiag_max)
613
fdiag_max = fdiag_len;
614
fdiag_buf = xrealloc (fdiag_buf, fdiag_max * (2 * sizeof (int)));
616
fdiag = fdiag_buf + string[1].data_length + 1;
617
bdiag = fdiag + fdiag_len;
619
/* Now do the main comparison algorithm */
620
string[0].edit_count = 0;
621
string[1].edit_count = 0;
622
compareseq (0, string[0].data_length, 0, string[1].data_length, 0);
625
((number of chars in common) / (average length of the strings)).
626
This is admittedly biased towards finding that the strings are
627
similar, however it does produce meaningful results. */
628
return ((double) (string[0].data_length + string[1].data_length -
629
string[1].edit_count - string[0].edit_count) / (string[0].data_length
630
+ string[1].data_length));