5
// Matthias Hertel, http://www.mathertel.de//
6
// some tweaks made by Mike KrĆ¼ger <mkrueger@novell.com>
8
// Copyright (c) by Matthias Hertel, http://www.mathertel.de//
10
// Permission is hereby granted, free of charge, to any person obtaining a copy
11
// of this software and associated documentation files (the "Software"), to deal
12
// in the Software without restriction, including without limitation the rights
13
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
// copies of the Software, and to permit persons to whom the Software is
15
// furnished to do so, subject to the following conditions:
17
// The above copyright notice and this permission notice shall be included in
18
// all copies or substantial portions of the Software.
20
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
28
// diff.cs: A port of the algorythm to C#
29
// Copyright (c) by Matthias Hertel, http://www.mathertel.de
30
// This work is licensed under a BSD style license. See http://www.mathertel.de/License.aspx
32
// This Class implements the Difference Algorithm published in
33
// "An O(ND) Difference Algorithm and its Variations" by Eugene Myers
34
// Algorithmica Vol. 1 No. 2, 1986, p 251.
36
// There are many C, Java, Lisp implementations public available but they all seem to come
37
// from the same source (diffutils) that is under the (unfree) GNU public License
38
// and cannot be reused as a sourcecode for a commercial application.
39
// There are very old C implementations that use other (worse) algorithms.
40
// Microsoft also published sourcecode of a diff-tool (windiff) that uses some tree data.
41
// Also, a direct transfer from a C source to C# is not easy because there is a lot of pointer
42
// arithmetic in the typical C solutions and i need a managed solution.
43
// These are the reasons why I implemented the original published algorithm from the scratch and
44
// make it avaliable without the GNU license limitations.
45
// I do not need a high performance diff tool because it is used only sometimes.
46
// I will do some performace tweaking when needed.
48
// The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents
49
// each line is converted into a (hash) number. See DiffText().
51
// Some chages to the original algorithm:
52
// The original algorithm was described using a recursive approach and comparing zero indexed arrays.
53
// Extracting sub-arrays and rejoining them is very performance and memory intensive so the same
54
// (readonly) data arrays are passed arround together with their lower and upper bounds.
55
// This circumstance makes the LCS and SMS functions more complicate.
56
// I added some code to the LCS function to get a fast response on sub-arrays that are identical,
57
// completely deleted or inserted.
59
// The result from a comparisation is stored in 2 arrays that flag for modified (deleted or inserted)
60
// lines in the 2 data arrays. These bits are then analysed to produce a array of Hunk objects.
62
// Further possible optimizations:
63
// (first rule: don't do it; second: don't do it yet)
64
// The arrays DataA and DataB are passed as parameters, but are never changed after the creation
65
// so they can be members of the class to avoid the paramter overhead.
66
// In SMS is a lot of boundary arithmetic in the for-D and for-k loops that can be done by increment
67
// and decrement of local variables.
68
// The DownVector and UpVector arrays are alywas created and destroyed each time the SMS gets called.
69
// It is possible to reuse tehm when transfering them to members of the class.
73
// 2002.09.20 There was a "hang" in some situations.
74
// Now I undestand a little bit more of the SMS algorithm.
75
// There have been overlapping boxes; that where analyzed partial differently.
76
// One return-point is enough.
77
// A assertion was added in CreateDiffs when in debug-mode, that counts the number of equal (no modified) lines in both arrays.
78
// They must be identical.
80
// 2003.02.07 Out of bounds error in the Up/Down vector arrays in some situations.
81
// The two vetors are now accessed using different offsets that are adjusted using the start k-Line.
82
// A test case is added.
84
// 2006.03.05 Some documentation and a direct Diff entry point.
86
// 2006.03.08 Refactored the API to static methods on the Diff class to make usage simpler.
87
// 2006.03.10 using the standard Debug class for self-test now.
88
// compile with: csc /target:exe /out:diffTest.exe /d:DEBUG /d:TRACE /d:SELFTEST Diff.cs
89
// 2007.01.06 license agreement changed to a BSD style license.
90
// 2007.06.03 added the Optimize method.
91
// 2007.09.23 UpVector and DownVector optimization by Jan Stoklasa ().
92
// 2008.05.31 Adjusted the testing code that failed because of the Optimize method (not a bug in the diff algorithm).
93
// 2008.10.08 Fixing a test case and adding a new test case.
96
using System.Collections.Generic;
98
using System.Text.RegularExpressions;
100
namespace Mono.TextEditor.Utils
104
public static readonly Hunk Empty = new Hunk (0, 0, 0, 0);
106
public bool IsEmpty {
108
return InsertStart <= 0;
112
public readonly int InsertStart;
113
public readonly int RemoveStart;
115
public readonly int Removed;
116
public readonly int Inserted;
118
public Hunk (int removeStart, int insertStart, int removed, int inserted)
120
this.InsertStart = insertStart;
121
this.RemoveStart = removeStart;
122
this.Removed = removed;
123
this.Inserted = inserted;
126
public static bool operator ==(Hunk left, Hunk right)
128
return left.InsertStart == right.InsertStart && left.RemoveStart == right.RemoveStart &&
129
left.Removed == right.Removed && left.Inserted == right.Inserted;
132
public static bool operator !=(Hunk left, Hunk right)
134
return !(left == right);
137
public override bool Equals (object obj)
141
return ((Hunk)obj) == this;
144
public override int GetHashCode ()
146
return InsertStart ^ RemoveStart ^ Inserted ^ Removed;
149
public override string ToString ()
152
return"[Hunk: Empty]";
153
return string.Format ("[Hunk: InsertStart={0}, RemoveStart={1}, Removed={2}, Inserted={3}]", InsertStart, RemoveStart, Removed, Inserted);
157
public sealed class Diff
160
/// Shortest Middle Snake Return Data
167
static void Optimize<T> (DiffData<T> data)
170
while (startPos < data.Length) {
171
while (startPos < data.Length && data.Modified[startPos] == false)
173
int endPos = startPos;
174
while (endPos < data.Length && data.Modified[endPos] == true)
177
if (endPos < data.Length && data.Data[startPos].Equals (data.Data[endPos])) {
178
data.Modified[startPos] = false;
179
data.Modified[endPos] = true;
186
public static IEnumerable<Hunk> CharDiff (string left, string right)
188
return GetDiff (left != null ? left.ToCharArray () : new char[0], right != null ? right.ToCharArray () : new char[0]);
191
public static IEnumerable<Hunk> GetDiff<T> (T[] baseArray, T[] changedArray)
193
// The A-Version of the data (original data) to be compared.
194
var dataA = new DiffData<T> (baseArray);
196
// The B-Version of the data (modified data) to be compared.
197
var dataB = new DiffData<T> (changedArray);
199
int MAX = dataA.Length + dataB.Length + 1;
200
/// vector for the (0,0) to (x,y) search
201
int[] downVector = new int[2 * MAX + 2];
202
/// vector for the (u,v) to (N,M) search
203
int[] upVector = new int[2 * MAX + 2];
205
LCS (dataA, 0, dataA.Length, dataB, 0, dataB.Length, downVector, upVector);
206
return CreateDiffs (dataA, dataB);
209
/// <summary>Scan the tables of which lines are inserted and deleted,
210
/// producing an edit script in forward order.
213
static IEnumerable<Hunk> CreateDiffs<T> (DiffData<T> baseData, DiffData<T> changedData)
217
while (lineA < baseData.Length || lineB < changedData
219
if (lineA < baseData.Length && !baseData.Modified[lineA] && lineB < changedData
220
.Length && !changedData
227
// maybe deleted and/or inserted lines
231
while (lineA < baseData.Length && (lineB >= changedData
232
.Length || baseData.Modified[lineA]))
233
// while (LineA < DataA.Length && DataA.Modified[LineA])
236
while (lineB < changedData
237
.Length && (lineA >= baseData.Length || changedData
239
// while (LineB < DataB.Length && DataB.Modified[LineB])
242
if (startA < lineA || startB < lineB) {
243
// store a new difference-item
244
yield return new Hunk (startA + 1, startB + 1, lineA - startA, lineB - startB);
254
/// This is the algorithm to find the Shortest Middle Snake (SMS).
256
/// <param name="dataA">sequence A</param>
257
/// <param name="lowerA">lower bound of the actual range in DataA</param>
258
/// <param name="upperA">upper bound of the actual range in DataA (exclusive)</param>
259
/// <param name="dataB">sequence B</param>
260
/// <param name="lowerB">lower bound of the actual range in DataB</param>
261
/// <param name="upperB">upper bound of the actual range in DataB (exclusive)</param>
262
/// <param name="downVector">a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.</param>
263
/// <param name="upVector">a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.</param>
264
/// <returns>a MiddleSnakeData record containing x,y and u,v</returns>
265
static SMSRD SMS<T> (DiffData<T> dataA, int lowerA, int upperA, DiffData<T> dataB, int lowerB, int upperB, int[] downVector, int[] upVector)
268
int MAX = dataA.Length + dataB.Length + 1;
270
int downK = lowerA - lowerB;
271
// the k-line to start the forward search
272
int upK = upperA - upperB;
273
// the k-line to start the reverse search
274
int delta = (upperA - lowerA) - (upperB - lowerB);
275
bool oddDelta = (delta & 1) != 0;
277
// The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based
278
// and are access using a specific offset: UpOffset UpVector and DownOffset for DownVektor
279
int downOffset = MAX - downK;
280
int upOffset = MAX - upK;
282
int MaxD = ((upperA - lowerA + upperB - lowerB) / 2) + 1;
284
// Debug.Write(2, "SMS", String.Format("Search the box: A[{0}-{1}] to B[{2}-{3}]", LowerA, UpperA, LowerB, UpperB));
287
downVector[downOffset + downK + 1] = lowerA;
288
upVector[upOffset + upK - 1] = upperA;
290
for (int D = 0; D <= MaxD; D++) {
292
// Extend the forward path.
293
for (int k = downK - D; k <= downK + D; k += 2) {
294
// Debug.Write(0, "SMS", "extend forward path " + k.ToString());
296
// find the only or better starting point
298
if (k == downK - D) {
299
x = downVector[downOffset + k + 1];
302
x = downVector[downOffset + k - 1] + 1;
303
// a step to the right
304
if (k < downK + D && downVector[downOffset + k + 1] >= x)
305
x = downVector[downOffset + k + 1];
310
// find the end of the furthest reaching forward D-path in diagonal k.
311
while (x < upperA && y < upperB && dataA.Data[x].Equals (dataB.Data[y])) {
315
downVector[downOffset + k] = x;
318
if (oddDelta && upK - D < k && k < upK + D) {
319
if (upVector[upOffset + k] <= downVector[downOffset + k]) {
320
ret.x = downVector[downOffset + k];
321
ret.y = downVector[downOffset + k] - k;
322
// ret.u = UpVector[UpOffset + k]; // 2002.09.20: no need for 2 points
323
// ret.v = UpVector[UpOffset + k] - k;
331
// Extend the reverse path.
332
for (int k = upK - D; k <= upK + D; k += 2) {
333
// Debug.Write(0, "SMS", "extend reverse path " + k.ToString());
335
// find the only or better starting point
338
x = upVector[upOffset + k - 1];
341
x = upVector[upOffset + k + 1] - 1;
343
if (k > upK - D && upVector[upOffset + k - 1] < x)
344
x = upVector[upOffset + k - 1];
350
while (x > lowerA && y > lowerB && dataA.Data[x - 1].Equals (dataB.Data[y - 1])) {
355
upVector[upOffset + k] = x;
358
if (!oddDelta && downK - D <= k && k <= downK + D) {
359
if (upVector[upOffset + k] <= downVector[downOffset + k]) {
360
ret.x = downVector[downOffset + k];
361
ret.y = downVector[downOffset + k] - k;
362
// ret.u = UpVector[UpOffset + k]; // 2002.09.20: no need for 2 points
363
// ret.v = UpVector[UpOffset + k] - k;
373
throw new ApplicationException ("the algorithm should never come here.");
378
/// This is the divide-and-conquer implementation of the longes common-subsequence (LCS)
380
/// The published algorithm passes recursively parts of the A and B sequences.
381
/// To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant.
383
/// <param name="dataA">sequence A</param>
384
/// <param name="lowerA">lower bound of the actual range in DataA</param>
385
/// <param name="upperA">upper bound of the actual range in DataA (exclusive)</param>
386
/// <param name="dataB">sequence B</param>
387
/// <param name="lowerB">lower bound of the actual range in DataB</param>
388
/// <param name="upperB">upper bound of the actual range in DataB (exclusive)</param>
389
/// <param name="downVector">a vector for the (0,0) to (x,y) search. Passed as a parameter for speed reasons.</param>
390
/// <param name="upVector">a vector for the (u,v) to (N,M) search. Passed as a parameter for speed reasons.</param>
391
static void LCS<T> (DiffData<T> dataA, int lowerA, int upperA, DiffData<T> dataB, int lowerB, int upperB, int[] downVector, int[] upVector)
393
// Fast walkthrough equal lines at the start
394
while (lowerA < upperA && lowerB < upperB && dataA.Data[lowerA].Equals (dataB.Data[lowerB])) {
399
// Fast walkthrough equal lines at the end
400
while (lowerA < upperA && lowerB < upperB && dataA.Data[upperA - 1].Equals (dataB.Data[upperB - 1])) {
405
if (lowerA == upperA) {
406
// mark as inserted lines.
407
while (lowerB < upperB)
408
dataB.Modified[lowerB++] = true;
410
} else if (lowerB == upperB) {
411
// mark as deleted lines.
412
while (lowerA < upperA)
413
dataA.Modified[lowerA++] = true;
416
// Find the middle snakea and length of an optimal path for A and B
417
SMSRD smsrd = SMS (dataA, lowerA, upperA, dataB, lowerB, upperB, downVector, upVector);
418
// Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y));
420
// The path is from LowerX to (x,y) and (x,y) to UpperX
421
LCS (dataA, lowerA, smsrd.x, dataB, lowerB, smsrd.y, downVector, upVector);
422
LCS (dataA, smsrd.x, upperA, dataB, smsrd.y, upperB, downVector, upVector);
423
// 2002.09.20: no need for 2 points
429
public static string GetDiffString (Document baseDocument, Document changedDocument)
431
return GetDiffString (baseDocument.Diff (changedDocument), baseDocument, changedDocument, baseDocument.FileName, changedDocument.FileName);
434
public static string GetDiffString (IEnumerable<Hunk> diff, Document baseDocument, Document changedDocument, string baseFileName, string changedFileName)
438
StringBuilder sb = new StringBuilder ();
439
sb.AppendLine ("--- " + baseFileName);
440
sb.AppendLine ("+++ " + changedFileName);
442
foreach (var item in diff) {
443
int remStart = System.Math.Max (1, item.RemoveStart - 3);
444
int remEnd = System.Math.Min (baseDocument.LineCount, item.RemoveStart + item.Removed + 3);
445
int insStart = System.Math.Max (1, item.InsertStart - 3);
446
int insEnd = System.Math.Min (changedDocument.LineCount, item.InsertStart + item.Inserted + 3);
448
sb.AppendLine ("@@ -" + remStart + "," + (remEnd - remStart) + " +" + insStart + "," + (insEnd - insStart) + " @@");
449
for (int i = System.Math.Min (remStart, insStart); i < item.RemoveStart; i++) {
450
sb.AppendLine (" " + baseDocument.GetLineText (i, false));
452
for (int i = item.RemoveStart; i < item.RemoveStart + item.Removed; i++) {
453
sb.AppendLine ("-" + baseDocument.GetLineText (i, false));
455
for (int i = item.InsertStart; i < item.InsertStart + item.Inserted; i++) {
456
sb.AppendLine ("+" + changedDocument.GetLineText (i, false));
458
for (int i = item.RemoveStart + item.Removed; i < remEnd; i++) {
459
sb.AppendLine (" " + baseDocument.GetLineText (i, false));
462
return sb.ToString ();
466
/// <summary>Data on one input file being compared.
470
/// <summary>Number of elements (lines).</summary>
471
public readonly int Length;
473
/// <summary>Buffer of numbers that will be compared.</summary>
474
public readonly T[] Data;
477
/// Array of booleans that flag for modified data.
478
/// This is the result of the diff.
479
/// This means deletedA in the first Data or inserted in the second Data.
481
public readonly bool[] Modified;
484
/// Initialize the Diff-Data buffer.
486
/// <param name="data">reference to the buffer</param>
487
public DiffData (T[] initData)
490
Length = initData.Length;
491
Modified = new bool[Length + 2];