~widelands-dev/widelands-website/django_staticfiles

Viewing changes to diff_match_patch.py

  • Committer: Holger Rapp
  • Date: 2009-02-21 18:24:02 UTC
  • Revision ID: sirver@kallisto.local-20090221182402-k3tuf5c4gjwslbjf
Main Page now contains the same information as before

#!/usr/bin/python2.4

# Taken from google because they do not provide a setup.py file. Thanks anyway

"""Diff Match and Patch.

Copyright 2006 Google Inc.
http://code.google.com/p/google-diff-match-patch/

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

"""

"""Functions for diff, match and patch.

Computes the difference between two texts to create a patch.
Applies the patch onto another text, allowing for errors.
"""

__author__ = 'fraser@google.com (Neil Fraser)'

import math
import time
import urllib
import re


class diff_match_patch:
    """Class containing the diff, match and patch methods.

    Also contains the behaviour settings.

    """

    def __init__(self):
        """Inits a diff_match_patch object with default settings.

        Redefine these in your program to override the defaults.

        """

        # Number of seconds to map a diff before giving up (0 for infinity).
        self.Diff_Timeout = 1.0
        # Cost of an empty edit operation in terms of edit characters.
        self.Diff_EditCost = 4
        # The size beyond which the double-ended diff activates.
        # Double-ending is twice as fast, but less accurate.
        self.Diff_DualThreshold = 32
        # At what point is no match declared (0.0 = perfection, 1.0 = very
        # loose).
        self.Match_Threshold = 0.5
        # How far to search for a match (0 = exact location, 1000+ = broad match).
        # A match this many characters away from the expected location will add
        # 1.0 to the score (0.0 is a perfect match).
        self.Match_Distance = 1000
        # When deleting a large block of text (over ~64 characters), how close do
        # the contents have to match the expected contents. (0.0 = perfection,
        # 1.0 = very loose).  Note that Match_Threshold controls how closely the
        # end points of a delete need to match.
        self.Patch_DeleteThreshold = 0.5
        # Chunk size for context length.
        self.Patch_Margin = 4

        # How many bits in a number?
        # Python has no maximum, thus to disable patch splitting set to 0.
        # However to avoid long patches in certain pathological cases, use 32.
        # Multiple short patches (using native ints) are much faster than long
        # ones.
        self.Match_MaxBits = 32
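
    # Editor's note (illustrative sketch, not part of the upstream library):
    # the settings above are plain instance attributes, so callers would
    # typically override them right after construction, e.g.:
    #
    #   dmp = diff_match_patch()
    #   dmp.Diff_Timeout = 0       # never give up mapping a diff
    #   dmp.Match_Threshold = 0.8  # accept looser fuzzy matches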

    #  DIFF FUNCTIONS

    # The data structure representing a diff is an array of tuples:
    # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
    # which means: delete "Hello", add "Goodbye" and keep " world."
    DIFF_DELETE = -1
    DIFF_INSERT = 1
    DIFF_EQUAL = 0
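
    # Editor's note (illustrative sketch, not part of the upstream library):
    # diff_main below returns exactly this tuple structure; running it on the
    # example texts and then applying the semantic cleanup gives:
    #
    #   dmp = diff_match_patch()
    #   diffs = dmp.diff_main('Hello world.', 'Goodbye world.')
    #   dmp.diff_cleanupSemantic(diffs)
    #   # diffs == [(-1, 'Hello'), (1, 'Goodbye'), (0, ' world.')]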

    def diff_main(self, text1, text2, checklines=True):
        """Find the differences between two texts.  Simplifies the problem by
        stripping any common prefix or suffix off the texts before diffing.

        Args:
          text1: Old string to be diffed.
          text2: New string to be diffed.
          checklines: Optional speedup flag.  If present and false, then don't run
            a line-level diff first to identify the changed areas.
            Defaults to true, which does a faster, slightly less optimal diff.

        Returns:
          Array of changes.

        """

        # Check for equality (speedup)
        if text1 == text2:
            return [(self.DIFF_EQUAL, text1)]

        # Trim off common prefix (speedup)
        commonlength = self.diff_commonPrefix(text1, text2)
        commonprefix = text1[:commonlength]
        text1 = text1[commonlength:]
        text2 = text2[commonlength:]

        # Trim off common suffix (speedup)
        commonlength = self.diff_commonSuffix(text1, text2)
        if commonlength == 0:
            commonsuffix = ''
        else:
            commonsuffix = text1[-commonlength:]
            text1 = text1[:-commonlength]
            text2 = text2[:-commonlength]

        # Compute the diff on the middle block
        diffs = self.diff_compute(text1, text2, checklines)

        # Restore the prefix and suffix
        if commonprefix:
            diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
        if commonsuffix:
            diffs.append((self.DIFF_EQUAL, commonsuffix))
        self.diff_cleanupMerge(diffs)
        return diffs

    def diff_compute(self, text1, text2, checklines):
        """Find the differences between two texts.  Assumes that the texts do
        not have any common prefix or suffix.

        Args:
          text1: Old string to be diffed.
          text2: New string to be diffed.
          checklines: Speedup flag.  If false, then don't run a line-level diff
            first to identify the changed areas.
            If true, then run a faster, slightly less optimal diff.

        Returns:
          Array of changes.

        """
        if not text1:
            # Just add some text (speedup)
            return [(self.DIFF_INSERT, text2)]

        if not text2:
            # Just delete some text (speedup)
            return [(self.DIFF_DELETE, text1)]

        if len(text1) > len(text2):
            (longtext, shorttext) = (text1, text2)
        else:
            (shorttext, longtext) = (text1, text2)
        i = longtext.find(shorttext)
        if i != -1:
            # Shorter text is inside the longer text (speedup)
            diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
                     (self.DIFF_INSERT, longtext[i + len(shorttext):])]
            # Swap insertions for deletions if diff is reversed.
            if len(text1) > len(text2):
                diffs[0] = (self.DIFF_DELETE, diffs[0][1])
                diffs[2] = (self.DIFF_DELETE, diffs[2][1])
            return diffs
        longtext = shorttext = None  # Garbage collect.

        # Check to see if the problem can be split in two.
        hm = self.diff_halfMatch(text1, text2)
        if hm:
            # A half-match was found, sort out the return data.
            (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
            # Send both pairs off for separate processing.
            diffs_a = self.diff_main(text1_a, text2_a, checklines)
            diffs_b = self.diff_main(text1_b, text2_b, checklines)
            # Merge the results.
            return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b

        # Perform a real diff.
        if checklines and (len(text1) < 100 or len(text2) < 100):
            checklines = False  # Too trivial for the overhead.
        if checklines:
            # Scan the text on a line-by-line basis first.
            (text1, text2, linearray) = self.diff_linesToChars(text1, text2)

        diffs = self.diff_map(text1, text2)
        if not diffs:  # No acceptable result.
            diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
        if checklines:
            # Convert the diff back to original text.
            self.diff_charsToLines(diffs, linearray)
            # Eliminate freak matches (e.g. blank lines)
            self.diff_cleanupSemantic(diffs)

            # Rediff any replacement blocks, this time character-by-character.
            # Add a dummy entry at the end.
            diffs.append((self.DIFF_EQUAL, ''))
            pointer = 0
            count_delete = 0
            count_insert = 0
            text_delete = ''
            text_insert = ''
            while pointer < len(diffs):
                if diffs[pointer][0] == self.DIFF_INSERT:
                    count_insert += 1
                    text_insert += diffs[pointer][1]
                elif diffs[pointer][0] == self.DIFF_DELETE:
                    count_delete += 1
                    text_delete += diffs[pointer][1]
                elif diffs[pointer][0] == self.DIFF_EQUAL:
                    # Upon reaching an equality, check for prior redundancies.
                    if count_delete >= 1 and count_insert >= 1:
                        # Delete the offending records and add the merged ones.
                        a = self.diff_main(text_delete, text_insert, False)
                        diffs[pointer - count_delete -
                              count_insert: pointer] = a
                        pointer = pointer - count_delete - \
                            count_insert + len(a)
                    count_insert = 0
                    count_delete = 0
                    text_delete = ''
                    text_insert = ''

                pointer += 1

            diffs.pop()  # Remove the dummy entry at the end.
        return diffs

    def diff_linesToChars(self, text1, text2):
        """Split two texts into an array of strings.  Reduce the texts to a
        string of hashes where each Unicode character represents one line.

        Args:
          text1: First string.
          text2: Second string.

        Returns:
          Three element tuple, containing the encoded text1, the encoded text2 and
          the array of unique strings.  The zeroth element of the array of unique
          strings is intentionally blank.

        """
        lineArray = []  # e.g. lineArray[4] == "Hello\n"
        lineHash = {}   # e.g. lineHash["Hello\n"] == 4

        # "\x00" is a valid character, but various debuggers don't like it.
        # So we'll insert a junk entry to avoid generating a null character.
        lineArray.append('')

        def diff_linesToCharsMunge(text):
            """Split a text into an array of strings.  Reduce the texts to a
            string of hashes where each Unicode character represents one line.
            Modifies linearray and linehash through being a closure.

            Args:
              text: String to encode.

            Returns:
              Encoded string.

            """
            chars = []
            # Walk the text, pulling out a substring for each line.
            # text.split('\n') would temporarily double our memory footprint.
            # Modifying text would create many large strings to garbage
            # collect.
            lineStart = 0
            lineEnd = -1
            while lineEnd < len(text) - 1:
                lineEnd = text.find('\n', lineStart)
                if lineEnd == -1:
                    lineEnd = len(text) - 1
                line = text[lineStart:lineEnd + 1]
                lineStart = lineEnd + 1

                if line in lineHash:
                    chars.append(unichr(lineHash[line]))
                else:
                    lineArray.append(line)
                    lineHash[line] = len(lineArray) - 1
                    chars.append(unichr(len(lineArray) - 1))
            return ''.join(chars)

        chars1 = diff_linesToCharsMunge(text1)
        chars2 = diff_linesToCharsMunge(text2)
        return (chars1, chars2, lineArray)

    def diff_charsToLines(self, diffs, lineArray):
        """Rehydrate the text in a diff from a string of line hashes to real
        lines of text.

        Args:
          diffs: Array of diff tuples.
          lineArray: Array of unique strings.

        """
        for x in xrange(len(diffs)):
            text = []
            for char in diffs[x][1]:
                text.append(lineArray[ord(char)])
            diffs[x] = (diffs[x][0], ''.join(text))

    def diff_map(self, text1, text2):
        """Explore the intersection points between the two texts.

        Args:
          text1: Old string to be diffed.
          text2: New string to be diffed.

        Returns:
          Array of diff tuples or None if no diff available.

        """

        # Unlike in most languages, Python counts time in seconds.
        s_end = time.time() + self.Diff_Timeout  # Don't run for too long.
        # Cache the text lengths to prevent multiple calls.
        text1_length = len(text1)
        text2_length = len(text2)
        max_d = text1_length + text2_length - 1
        doubleEnd = self.Diff_DualThreshold * 2 < max_d
        v_map1 = []
        v_map2 = []
        v1 = {}
        v2 = {}
        v1[1] = 0
        v2[1] = 0
        footsteps = {}
        done = False
        # If the total number of characters is odd, then the front path will
        # collide with the reverse path.
        front = (text1_length + text2_length) % 2
        for d in xrange(max_d):
            # Bail out if timeout reached.
            if self.Diff_Timeout > 0 and time.time() > s_end:
                return None

            # Walk the front path one step.
            v_map1.append({})
            for k in xrange(-d, d + 1, 2):
                if k == -d or k != d and v1[k - 1] < v1[k + 1]:
                    x = v1[k + 1]
                else:
                    x = v1[k - 1] + 1
                y = x - k
                if doubleEnd:
                    footstep = (x, y)
                    if front and footstep in footsteps:
                        done = True
                    if not front:
                        footsteps[footstep] = d

                while (not done and x < text1_length and y < text2_length and
                       text1[x] == text2[y]):
                    x += 1
                    y += 1
                    if doubleEnd:
                        footstep = (x, y)
                        if front and footstep in footsteps:
                            done = True
                        if not front:
                            footsteps[footstep] = d

                v1[k] = x
                v_map1[d][(x, y)] = True
                if x == text1_length and y == text2_length:
                    # Reached the end in single-path mode.
                    return self.diff_path1(v_map1, text1, text2)
                elif done:
                    # Front path ran over reverse path.
                    v_map2 = v_map2[:footsteps[footstep] + 1]
                    a = self.diff_path1(v_map1, text1[:x], text2[:y])
                    b = self.diff_path2(v_map2, text1[x:], text2[y:])
                    return a + b

            if doubleEnd:
                # Walk the reverse path one step.
                v_map2.append({})
                for k in xrange(-d, d + 1, 2):
                    if k == -d or k != d and v2[k - 1] < v2[k + 1]:
                        x = v2[k + 1]
                    else:
                        x = v2[k - 1] + 1
                    y = x - k
                    footstep = (text1_length - x, text2_length - y)
                    if not front and footstep in footsteps:
                        done = True
                    if front:
                        footsteps[footstep] = d
                    while (not done and x < text1_length and y < text2_length and
                           text1[-x - 1] == text2[-y - 1]):
                        x += 1
                        y += 1
                        footstep = (text1_length - x, text2_length - y)
                        if not front and footstep in footsteps:
                            done = True
                        if front:
                            footsteps[footstep] = d

                    v2[k] = x
                    v_map2[d][(x, y)] = True
                    if done:
                        # Reverse path ran over front path.
                        v_map1 = v_map1[:footsteps[footstep] + 1]
                        a = self.diff_path1(v_map1, text1[:text1_length - x],
                                            text2[:text2_length - y])
                        b = self.diff_path2(v_map2, text1[text1_length - x:],
                                            text2[text2_length - y:])
                        return a + b

        # Number of diffs equals number of characters, no commonality at all.
        return None

    def diff_path1(self, v_map, text1, text2):
        """Work from the middle back to the start to determine the path.

        Args:
          v_map: Array of paths.
          text1: Old string fragment to be diffed.
          text2: New string fragment to be diffed.

        Returns:
          Array of diff tuples.

        """
        path = []
        x = len(text1)
        y = len(text2)
        last_op = None
        for d in xrange(len(v_map) - 2, -1, -1):
            while True:
                if (x - 1, y) in v_map[d]:
                    x -= 1
                    if last_op == self.DIFF_DELETE:
                        path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
                    else:
                        path[:0] = [(self.DIFF_DELETE, text1[x])]
                    last_op = self.DIFF_DELETE
                    break
                elif (x, y - 1) in v_map[d]:
                    y -= 1
                    if last_op == self.DIFF_INSERT:
                        path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
                    else:
                        path[:0] = [(self.DIFF_INSERT, text2[y])]
                    last_op = self.DIFF_INSERT
                    break
                else:
                    x -= 1
                    y -= 1
                    assert text1[x] == text2[y], ('No diagonal.  ' +
                                                  "Can't happen. (diff_path1)")
                    if last_op == self.DIFF_EQUAL:
                        path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
                    else:
                        path[:0] = [(self.DIFF_EQUAL, text1[x])]
                    last_op = self.DIFF_EQUAL
        return path

    def diff_path2(self, v_map, text1, text2):
        """Work from the middle back to the end to determine the path.

        Args:
          v_map: Array of paths.
          text1: Old string fragment to be diffed.
          text2: New string fragment to be diffed.

        Returns:
          Array of diff tuples.

        """
        path = []
        x = len(text1)
        y = len(text2)
        last_op = None
        for d in xrange(len(v_map) - 2, -1, -1):
            while True:
                if (x - 1, y) in v_map[d]:
                    x -= 1
                    if last_op == self.DIFF_DELETE:
                        path[-1] = (self.DIFF_DELETE, path[-1]
                                    [1] + text1[-x - 1])
                    else:
                        path.append((self.DIFF_DELETE, text1[-x - 1]))
                    last_op = self.DIFF_DELETE
                    break
                elif (x, y - 1) in v_map[d]:
                    y -= 1
                    if last_op == self.DIFF_INSERT:
                        path[-1] = (self.DIFF_INSERT, path[-1]
                                    [1] + text2[-y - 1])
                    else:
                        path.append((self.DIFF_INSERT, text2[-y - 1]))
                    last_op = self.DIFF_INSERT
                    break
                else:
                    x -= 1
                    y -= 1
                    assert text1[-x - 1] == text2[-y - 1], ('No diagonal.  ' +
                                                            "Can't happen. (diff_path2)")
                    if last_op == self.DIFF_EQUAL:
                        path[-1] = (self.DIFF_EQUAL, path[-1]
                                    [1] + text1[-x - 1])
                    else:
                        path.append((self.DIFF_EQUAL, text1[-x - 1]))
                    last_op = self.DIFF_EQUAL
        return path

    def diff_commonPrefix(self, text1, text2):
        """Determine the common prefix of two strings.

        Args:
          text1: First string.
          text2: Second string.

        Returns:
          The number of characters common to the start of each string.

        """
        # Quick check for common null cases.
        if not text1 or not text2 or text1[0] != text2[0]:
            return 0
        # Binary search.
        # Performance analysis: http://neil.fraser.name/news/2007/10/09/
        pointermin = 0
        pointermax = min(len(text1), len(text2))
        pointermid = pointermax
        pointerstart = 0
        while pointermin < pointermid:
            if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
                pointermin = pointermid
                pointerstart = pointermin
            else:
                pointermax = pointermid
            pointermid = int((pointermax - pointermin) / 2 + pointermin)
        return pointermid

    def diff_commonSuffix(self, text1, text2):
        """Determine the common suffix of two strings.

        Args:
          text1: First string.
          text2: Second string.

        Returns:
          The number of characters common to the end of each string.

        """
        # Quick check for common null cases.
        if not text1 or not text2 or text1[-1] != text2[-1]:
            return 0
        # Binary search.
        # Performance analysis: http://neil.fraser.name/news/2007/10/09/
        pointermin = 0
        pointermax = min(len(text1), len(text2))
        pointermid = pointermax
        pointerend = 0
        while pointermin < pointermid:
            if (text1[-pointermid:len(text1) - pointerend] ==
                    text2[-pointermid:len(text2) - pointerend]):
                pointermin = pointermid
                pointerend = pointermin
            else:
                pointermax = pointermid
            pointermid = int((pointermax - pointermin) / 2 + pointermin)
        return pointermid

    def diff_halfMatch(self, text1, text2):
        """Do the two texts share a substring which is at least half the length
        of the longer text?

        Args:
          text1: First string.
          text2: Second string.

        Returns:
          Five element Array, containing the prefix of text1, the suffix of text1,
          the prefix of text2, the suffix of text2 and the common middle.  Or None
          if there was no match.

        """
        if len(text1) > len(text2):
            (longtext, shorttext) = (text1, text2)
        else:
            (shorttext, longtext) = (text1, text2)
        if len(longtext) < 10 or len(shorttext) < 1:
            return None  # Pointless.

        def diff_halfMatchI(longtext, shorttext, i):
            """Does a substring of shorttext exist within longtext such that
            the substring is at least half the length of longtext? Closure, but
            does not reference any external variables.

            Args:
              longtext: Longer string.
              shorttext: Shorter string.
              i: Start index of quarter length substring within longtext.

            Returns:
              Five element Array, containing the prefix of longtext, the suffix of
              longtext, the prefix of shorttext, the suffix of shorttext and the
              common middle.  Or None if there was no match.

            """
            seed = longtext[i:i + len(longtext) / 4]
            best_common = ''
            j = shorttext.find(seed)
            while j != -1:
                prefixLength = self.diff_commonPrefix(
                    longtext[i:], shorttext[j:])
                suffixLength = self.diff_commonSuffix(
                    longtext[:i], shorttext[:j])
                if len(best_common) < suffixLength + prefixLength:
                    best_common = (shorttext[j - suffixLength:j] +
                                   shorttext[j:j + prefixLength])
                    best_longtext_a = longtext[:i - suffixLength]
                    best_longtext_b = longtext[i + prefixLength:]
                    best_shorttext_a = shorttext[:j - suffixLength]
                    best_shorttext_b = shorttext[j + prefixLength:]
                j = shorttext.find(seed, j + 1)

            if len(best_common) >= len(longtext) / 2:
                return (best_longtext_a, best_longtext_b,
                        best_shorttext_a, best_shorttext_b, best_common)
            else:
                return None

        # First check if the second quarter is the seed for a half-match.
        hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
        # Check again based on the third quarter.
        hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
        if not hm1 and not hm2:
            return None
        elif not hm2:
            hm = hm1
        elif not hm1:
            hm = hm2
        else:
            # Both matched.  Select the longest.
            if len(hm1[4]) > len(hm2[4]):
                hm = hm1
            else:
                hm = hm2

        # A half-match was found, sort out the return data.
        if len(text1) > len(text2):
            (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
        else:
            (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
        return (text1_a, text1_b, text2_a, text2_b, mid_common)

    def diff_cleanupSemantic(self, diffs):
        """Reduce the number of edits by eliminating semantically trivial
        equalities.

        Args:
          diffs: Array of diff tuples.

        """
        changes = False
        equalities = []  # Stack of indices where equalities are found.
        lastequality = None  # Always equal to equalities[-1][1]
        pointer = 0  # Index of current position.
        # Number of chars that changed prior to the equality.
        length_changes1 = 0
        length_changes2 = 0  # Number of chars that changed after the equality.
        while pointer < len(diffs):
            if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
                equalities.append(pointer)
                length_changes1 = length_changes2
                length_changes2 = 0
                lastequality = diffs[pointer][1]
            else:  # an insertion or deletion
                length_changes2 += len(diffs[pointer][1])
                if (lastequality != None and (len(lastequality) <= length_changes1) and
                        (len(lastequality) <= length_changes2)):
                    # Duplicate record
                    diffs.insert(equalities[-1],
                                 (self.DIFF_DELETE, lastequality))
                    # Change second copy to insert.
                    diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
                                                 diffs[equalities[-1] + 1][1])
                    # Throw away the equality we just deleted.
                    equalities.pop()
                    # Throw away the previous equality (it needs to be
                    # reevaluated).
                    if len(equalities) != 0:
                        equalities.pop()
                    if len(equalities):
                        pointer = equalities[-1]
                    else:
                        pointer = -1
                    length_changes1 = 0  # Reset the counters.
                    length_changes2 = 0
                    lastequality = None
                    changes = True
            pointer += 1

        if changes:
            self.diff_cleanupMerge(diffs)

        self.diff_cleanupSemanticLossless(diffs)

    def diff_cleanupSemanticLossless(self, diffs):
        """Look for single edits surrounded on both sides by equalities
        which can be shifted sideways to align the edit to a word boundary.
        e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

        Args:
          diffs: Array of diff tuples.
        """

        def diff_cleanupSemanticScore(one, two):
            """Given two strings, compute a score representing whether the
            internal boundary falls on logical boundaries. Scores range from 5
            (best) to 0 (worst). Closure, but does not reference any external
            variables.

            Args:
              one: First string.
              two: Second string.

            Returns:
              The score.

            """
            if not one or not two:
                # Edges are the best.
                return 5

            # Each port of this function behaves slightly differently due to
            # subtle differences in each language's definition of things like
            # 'whitespace'.  Since this function's purpose is largely cosmetic,
            # the choice has been made to use each language's native features
            # rather than force total conformity.
            score = 0
            # One point for non-alphanumeric.
            if not one[-1].isalnum() or not two[0].isalnum():
                score += 1
                # Two points for whitespace.
                if one[-1].isspace() or two[0].isspace():
                    score += 1
                    # Three points for line breaks.
                    if (one[-1] == '\r' or one[-1] == '\n' or
                            two[0] == '\r' or two[0] == '\n'):
                        score += 1
                        # Four points for blank lines.
                        if (re.search('\\n\\r?\\n$', one) or
                                re.match('^\\r?\\n\\r?\\n', two)):
                            score += 1
            return score

        pointer = 1
        # Intentionally ignore the first and last element (don't need
        # checking).
        while pointer < len(diffs) - 1:
            if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
                    diffs[pointer + 1][0] == self.DIFF_EQUAL):
                # This is a single edit surrounded by equalities.
                equality1 = diffs[pointer - 1][1]
                edit = diffs[pointer][1]
                equality2 = diffs[pointer + 1][1]

                # First, shift the edit as far left as possible.
                commonOffset = self.diff_commonSuffix(equality1, edit)
                if commonOffset:
                    commonString = edit[-commonOffset:]
                    equality1 = equality1[:-commonOffset]
                    edit = commonString + edit[:-commonOffset]
                    equality2 = commonString + equality2

                # Second, step character by character right, looking for the
                # best fit.
                bestEquality1 = equality1
                bestEdit = edit
                bestEquality2 = equality2
                bestScore = (diff_cleanupSemanticScore(equality1, edit) +
                             diff_cleanupSemanticScore(edit, equality2))
                while edit and equality2 and edit[0] == equality2[0]:
                    equality1 += edit[0]
                    edit = edit[1:] + equality2[0]
                    equality2 = equality2[1:]
                    score = (diff_cleanupSemanticScore(equality1, edit) +
                             diff_cleanupSemanticScore(edit, equality2))
                    # The >= encourages trailing rather than leading whitespace
                    # on edits.
                    if score >= bestScore:
                        bestScore = score
                        bestEquality1 = equality1
                        bestEdit = edit
                        bestEquality2 = equality2

                if diffs[pointer - 1][1] != bestEquality1:
                    # We have an improvement, save it back to the diff.
                    if bestEquality1:
                        diffs[pointer - 1] = (diffs[pointer - 1]
                                              [0], bestEquality1)
                    else:
                        del diffs[pointer - 1]
                        pointer -= 1
                    diffs[pointer] = (diffs[pointer][0], bestEdit)
                    if bestEquality2:
                        diffs[pointer + 1] = (diffs[pointer + 1]
                                              [0], bestEquality2)
                    else:
                        del diffs[pointer + 1]
                        pointer -= 1
            pointer += 1

    def diff_cleanupEfficiency(self, diffs):
        """Reduce the number of edits by eliminating operationally trivial
        equalities.

        Args:
          diffs: Array of diff tuples.

        """
        changes = False
        equalities = []  # Stack of indices where equalities are found.
        lastequality = ''  # Always equal to equalities[-1][1]
        pointer = 0  # Index of current position.
        # Is there an insertion operation before the last equality.
        pre_ins = False
        # Is there a deletion operation before the last equality.
        pre_del = False
        # Is there an insertion operation after the last equality.
        post_ins = False
        # Is there a deletion operation after the last equality.
        post_del = False
        while pointer < len(diffs):
            if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
                if (len(diffs[pointer][1]) < self.Diff_EditCost and
                        (post_ins or post_del)):
                    # Candidate found.
                    equalities.append(pointer)
                    pre_ins = post_ins
                    pre_del = post_del
                    lastequality = diffs[pointer][1]
                else:
                    # Not a candidate, and can never become one.
                    equalities = []
                    lastequality = ''

                post_ins = post_del = False
            else:  # an insertion or deletion
                if diffs[pointer][0] == self.DIFF_DELETE:
                    post_del = True
                else:
                    post_ins = True

                # Five types to be split:
                # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
                # <ins>A</ins>X<ins>C</ins><del>D</del>
                # <ins>A</ins><del>B</del>X<ins>C</ins>
                # <ins>A</del>X<ins>C</ins><del>D</del>
                # <ins>A</ins><del>B</del>X<del>C</del>

                if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
                                     ((len(lastequality) < self.Diff_EditCost / 2) and
                                      (pre_ins + pre_del + post_ins + post_del) == 3)):
                    # Duplicate record
                    diffs.insert(equalities[-1],
                                 (self.DIFF_DELETE, lastequality))
                    # Change second copy to insert.
                    diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
                                                 diffs[equalities[-1] + 1][1])
                    equalities.pop()  # Throw away the equality we just deleted
                    lastequality = ''
                    if pre_ins and pre_del:
                        # No changes made which could affect previous entry,
                        # keep going.
                        post_ins = post_del = True
                        equalities = []
                    else:
                        if len(equalities):
                            equalities.pop()  # Throw away the previous equality
                        if len(equalities):
                            pointer = equalities[-1]
                        else:
                            pointer = -1
                        post_ins = post_del = False
                    changes = True
            pointer += 1

        if changes:
            self.diff_cleanupMerge(diffs)

    def diff_cleanupMerge(self, diffs):
        """Reorder and merge like edit sections.  Merge equalities. Any edit
        section can move as long as it doesn't cross an equality.

        Args:
          diffs: Array of diff tuples.

        """
        diffs.append((self.DIFF_EQUAL, ''))  # Add a dummy entry at the end.
        pointer = 0
        count_delete = 0
        count_insert = 0
        text_delete = ''
        text_insert = ''
        while pointer < len(diffs):
            if diffs[pointer][0] == self.DIFF_INSERT:
                count_insert += 1
                text_insert += diffs[pointer][1]
                pointer += 1
            elif diffs[pointer][0] == self.DIFF_DELETE:
                count_delete += 1
                text_delete += diffs[pointer][1]
                pointer += 1
            elif diffs[pointer][0] == self.DIFF_EQUAL:
                # Upon reaching an equality, check for prior redundancies.
                if count_delete != 0 or count_insert != 0:
                    if count_delete != 0 and count_insert != 0:
                        # Factor out any common prefixies.
                        commonlength = self.diff_commonPrefix(
                            text_insert, text_delete)
                        if commonlength != 0:
                            x = pointer - count_delete - count_insert - 1
                            if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
                                diffs[x] = (diffs[x][0], diffs[x][1] +
                                            text_insert[:commonlength])
                            else:
                                diffs.insert(
                                    0, (self.DIFF_EQUAL, text_insert[:commonlength]))
                                pointer += 1
                            text_insert = text_insert[commonlength:]
                            text_delete = text_delete[commonlength:]
                        # Factor out any common suffixies.
                        commonlength = self.diff_commonSuffix(
                            text_insert, text_delete)
                        if commonlength != 0:
                            diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
                                              diffs[pointer][1])
                            text_insert = text_insert[:-commonlength]
                            text_delete = text_delete[:-commonlength]
                    # Delete the offending records and add the merged ones.
                    if count_delete == 0:
                        diffs[pointer - count_insert: pointer] = [
                            (self.DIFF_INSERT, text_insert)]
                    elif count_insert == 0:
                        diffs[pointer - count_delete: pointer] = [
                            (self.DIFF_DELETE, text_delete)]
                    else:
                        diffs[pointer - count_delete - count_insert: pointer] = [
                            (self.DIFF_DELETE, text_delete),
                            (self.DIFF_INSERT, text_insert)]
                    pointer = pointer - count_delete - count_insert + 1
                    if count_delete != 0:
                        pointer += 1
                    if count_insert != 0:
                        pointer += 1
                elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
                    # Merge this equality with the previous one.
                    diffs[pointer - 1] = (diffs[pointer - 1][0],
                                          diffs[pointer - 1][1] + diffs[pointer][1])
                    del diffs[pointer]
                else:
                    pointer += 1

                count_insert = 0
                count_delete = 0
                text_delete = ''
                text_insert = ''

        if diffs[-1][1] == '':
            diffs.pop()  # Remove the dummy entry at the end.

        # Second pass: look for single edits surrounded on both sides by equalities
        # which can be shifted sideways to eliminate an equality.
        # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
        changes = False
        pointer = 1
        # Intentionally ignore the first and last element (don't need
        # checking).
        while pointer < len(diffs) - 1:
            if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
                    diffs[pointer + 1][0] == self.DIFF_EQUAL):
                # This is a single edit surrounded by equalities.
                if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
                    # Shift the edit over the previous equality.
                    diffs[pointer] = (diffs[pointer][0],
                                      diffs[pointer - 1][1] +
                                      diffs[pointer][1][:-len(diffs[pointer - 1][1])])
                    diffs[pointer + 1] = (diffs[pointer + 1][0],
                                          diffs[pointer - 1][1] + diffs[pointer + 1][1])
                    del diffs[pointer - 1]
                    changes = True
                elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
                    # Shift the edit over the next equality.
                    diffs[pointer - 1] = (diffs[pointer - 1][0],
                                          diffs[pointer - 1][1] + diffs[pointer + 1][1])
                    diffs[pointer] = (diffs[pointer][0],
                                      diffs[pointer][1][len(diffs[pointer + 1][1]):] +
                                      diffs[pointer + 1][1])
                    del diffs[pointer + 1]
                    changes = True
            pointer += 1

        # If shifts were made, the diff needs reordering and another shift
        # sweep.
        if changes:
            self.diff_cleanupMerge(diffs)

    def diff_xIndex(self, diffs, loc):
        """loc is a location in text1, compute and return the equivalent location
        in text2.  e.g. "The cat" vs "The big cat", 1->1, 5->8

        Args:
          diffs: Array of diff tuples.
          loc: Location within text1.

        Returns:
          Location within text2.
        """
        chars1 = 0
        chars2 = 0
        last_chars1 = 0
        last_chars2 = 0
        for x in xrange(len(diffs)):
            (op, text) = diffs[x]
            if op != self.DIFF_INSERT:  # Equality or deletion.
                chars1 += len(text)
            if op != self.DIFF_DELETE:  # Equality or insertion.
                chars2 += len(text)
            if chars1 > loc:  # Overshot the location.
                break
            last_chars1 = chars1
            last_chars2 = chars2

        if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
            # The location was deleted.
            return last_chars2
        # Add the remaining len(character).
        return last_chars2 + (loc - last_chars1)

    def diff_prettyHtml(self, diffs):
        """Convert a diff array into a pretty HTML report.

        Args:
          diffs: Array of diff tuples.

        Returns:
          HTML representation.

        """
        html = []
        i = 0
        for (op, data) in diffs:
            text = (data.replace('&', '&amp;').replace('<', '&lt;')
                    .replace('>', '&gt;').replace('\n', '&para;<BR>'))
            if op == self.DIFF_INSERT:
                html.append("<SPAN STYLE=\"color: #00FF00;\" TITLE=\"i=%i\">%s</SPAN>"
                            % (i, text))
            elif op == self.DIFF_DELETE:
                html.append("<SPAN STYLE=\"color: #FF0000;\" TITLE=\"i=%i\">%s</SPAN>"
                            % (i, text))
            elif op == self.DIFF_EQUAL:
                html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text))
            if op != self.DIFF_DELETE:
                i += len(data)
        return ''.join(html)

    def diff_text1(self, diffs):
        """Compute and return the source text (all equalities and deletions).

        Args:
          diffs: Array of diff tuples.

        Returns:
          Source text.

        """
        text = []
        for (op, data) in diffs:
            if op != self.DIFF_INSERT:
                text.append(data)
        return ''.join(text)

    def diff_text2(self, diffs):
        """Compute and return the destination text (all equalities and
        insertions).

        Args:
          diffs: Array of diff tuples.

        Returns:
          Destination text.

        """
        text = []
        for (op, data) in diffs:
            if op != self.DIFF_DELETE:
                text.append(data)
        return ''.join(text)

    def diff_levenshtein(self, diffs):
        """Compute the Levenshtein distance; the number of inserted, deleted or
        substituted characters.

        Args:
          diffs: Array of diff tuples.

        Returns:
          Number of changes.

        """
        levenshtein = 0
        insertions = 0
        deletions = 0
        for (op, data) in diffs:
            if op == self.DIFF_INSERT:
                insertions += len(data)
            elif op == self.DIFF_DELETE:
                deletions += len(data)
            elif op == self.DIFF_EQUAL:
                # A deletion and an insertion is one substitution.
                levenshtein += max(insertions, deletions)
                insertions = 0
                deletions = 0
        levenshtein += max(insertions, deletions)
        return levenshtein
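
    # Editor's note (illustrative worked example, not part of the upstream
    # library): for [(-1, 'Hello'), (1, 'Goodbye'), (0, ' world.')] the
    # 5-char deletion and 7-char insertion overlap, so diff_levenshtein
    # counts max(7, 5) == 7 changes.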

    def diff_toDelta(self, diffs):
        """Crush the diff into an encoded string which describes the operations
        required to transform text1 into text2.
        E.g. =3\t-2\t+ing  -> Keep 3 chars, delete 2 chars, insert 'ing'.
        Operations are tab-separated.  Inserted text is escaped using %xx notation.

        Args:
          diffs: Array of diff tuples.

        Returns:
          Delta text.
        """
        text = []
        for (op, data) in diffs:
            if op == self.DIFF_INSERT:
                # High ascii will raise UnicodeDecodeError.  Use Unicode
                # instead.
                data = data.encode('utf-8')
                text.append('+' + urllib.quote(data, "!~*'();/?:@&=+$,# "))
            elif op == self.DIFF_DELETE:
                text.append('-%d' % len(data))
            elif op == self.DIFF_EQUAL:
                text.append('=%d' % len(data))
        return '\t'.join(text)
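
    # Editor's note (illustrative sketch, not part of the upstream library):
    # the delta format is a compact, tab-separated encoding of a diff, e.g.:
    #
    #   dmp = diff_match_patch()
    #   diffs = [(-1, 'Hello'), (1, 'Goodbye'), (0, ' world.')]
    #   delta = dmp.diff_toDelta(diffs)            # '-5\t+Goodbye\t=7'
    #   dmp.diff_fromDelta('Hello world.', delta)  # reconstructs diffs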
1155
 
 
1156
 
    def diff_fromDelta(self, text1, delta):
1157
 
        """Given the original text1, and an encoded string which describes the
1158
 
        operations required to transform text1 into text2, compute the full
1159
 
        diff.
1160
 
 
1161
 
        Args:
1162
 
          text1: Source string for the diff.
1163
 
          delta: Delta text.
1164
 
 
1165
 
        Returns:
1166
 
          Array of diff tuples.
1167
 
 
1168
 
        Raises:
1169
 
          ValueError: If invalid input.
1170
 
 
1171
 
        """
1172
 
        if type(delta) == unicode:
1173
 
            # Deltas should be composed of a subset of ascii chars, Unicode not
1174
 
            # required.  If this encode raises UnicodeEncodeError, delta is
1175
 
            # invalid.
1176
 
            delta = delta.encode('ascii')
1177
 
        diffs = []
1178
 
        pointer = 0  # Cursor in text1
1179
 
        tokens = delta.split('\t')
1180
 
        for token in tokens:
1181
 
            if token == '':
1182
 
                # Blank tokens are ok (from a trailing \t).
1183
 
                continue
1184
 
            # Each token begins with a one character parameter which specifies the
1185
 
            # operation of this token (delete, insert, equality).
1186
 
            param = token[1:]
1187
 
            if token[0] == '+':
1188
 
                param = urllib.unquote(param).decode('utf-8')
1189
 
                diffs.append((self.DIFF_INSERT, param))
1190
 
            elif token[0] == '-' or token[0] == '=':
1191
 
                try:
1192
 
                    n = int(param)
1193
 
                except ValueError:
1194
 
                    raise ValueError, 'Invalid number in diff_fromDelta: ' + param
1195
 
                if n < 0:
1196
 
                    raise ValueError, 'Negative number in diff_fromDelta: ' + param
1197
 
                text = text1[pointer: pointer + n]
1198
 
                pointer += n
1199
 
                if token[0] == '=':
1200
 
                    diffs.append((self.DIFF_EQUAL, text))
1201
 
                else:
1202
 
                    diffs.append((self.DIFF_DELETE, text))
1203
 
            else:
1204
 
                # Anything else is an error.
1205
 
                raise ValueError, ('Invalid diff operation in diff_fromDelta: ' +
1206
 
                                   token[0])
1207
 
        if pointer != len(text1):
1208
 
            raise ValueError, (
1209
 
                'Delta length (%d) does not equal source text length (%d).' %
1210
 
                (pointer, len(text1)))
1211
 
        return diffs

    #  MATCH FUNCTIONS

    def match_main(self, text, pattern, loc):
        """Locate the best instance of 'pattern' in 'text' near 'loc'.

        Args:
          text: The text to search.
          pattern: The pattern to search for.
          loc: The location to search around.

        Returns:
          Best match index or -1.

        """
        loc = max(0, min(loc, len(text)))
        if text == pattern:
            # Shortcut (potentially not guaranteed by the algorithm)
            return 0
        elif not text:
            # Nothing to match.
            return -1
        elif text[loc:loc + len(pattern)] == pattern:
            # Perfect match at the perfect spot!  (Includes case of null
            # pattern)
            return loc
        else:
            # Do a fuzzy compare.
            match = self.match_bitap(text, pattern, loc)
            return match
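
    # A minimal usage sketch (assumes a diff_match_patch instance named dmp):
    #   dmp.match_main('abcdef', 'abcdef', 1000)  # -> 0  (exact-match shortcut)
    #   dmp.match_main('abcdef', 'de', 3)         # -> 3  (pattern found at loc)
    #   dmp.match_main('', 'abcdef', 1)           # -> -1 (nothing to search)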

    def match_bitap(self, text, pattern, loc):
        """Locate the best instance of 'pattern' in 'text' near 'loc' using the
        Bitap algorithm.

        Args:
          text: The text to search.
          pattern: The pattern to search for.
          loc: The location to search around.

        Returns:
          Best match index or -1.

        """
        # Python doesn't have a maxint limit, so ignore this check.
        # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
        #  raise ValueError("Pattern too long for this application.")

        # Initialise the alphabet.
        s = self.match_alphabet(pattern)

        def match_bitapScore(e, x):
            """Compute and return the score for a match with e errors and x
            location. Accesses loc and pattern through being a closure.

            Args:
              e: Number of errors in match.
              x: Location of match.

            Returns:
              Overall score for match (0.0 = good, 1.0 = bad).

            """
            accuracy = float(e) / len(pattern)
            proximity = abs(loc - x)
            if not self.Match_Distance:
                # Dodge divide by zero error.
                return proximity and 1.0 or accuracy
            return accuracy + (proximity / float(self.Match_Distance))

        # Highest score beyond which we give up.
        score_threshold = self.Match_Threshold
        # Is there a nearby exact match? (speedup)
        best_loc = text.find(pattern, loc)
        if best_loc != -1:
            score_threshold = min(match_bitapScore(
                0, best_loc), score_threshold)
            # What about in the other direction? (speedup)
            best_loc = text.rfind(pattern, loc + len(pattern))
            if best_loc != -1:
                score_threshold = min(match_bitapScore(
                    0, best_loc), score_threshold)

        # Initialise the bit arrays.
        matchmask = 1 << (len(pattern) - 1)
        best_loc = -1

        bin_max = len(pattern) + len(text)
        # Empty initialization added to appease pychecker.
        last_rd = None
        for d in xrange(len(pattern)):
            # Scan for the best match; each iteration allows for one more error.
            # Run a binary search to determine how far from 'loc' we can stray at
            # this error level.
            bin_min = 0
            bin_mid = bin_max
            while bin_min < bin_mid:
                if match_bitapScore(d, loc + bin_mid) <= score_threshold:
                    bin_min = bin_mid
                else:
                    bin_max = bin_mid
                bin_mid = (bin_max - bin_min) / 2 + bin_min

            # Use the result from this iteration as the maximum for the next.
            bin_max = bin_mid
            start = max(1, loc - bin_mid + 1)
            finish = min(loc + bin_mid, len(text)) + len(pattern)

            rd = range(finish + 1)
            rd.append((1 << d) - 1)
            for j in xrange(finish, start - 1, -1):
                if len(text) <= j - 1:
                    # Out of range.
                    charMatch = 0
                else:
                    charMatch = s.get(text[j - 1], 0)
                if d == 0:  # First pass: exact match.
                    rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
                else:  # Subsequent passes: fuzzy match.
                    rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
                        ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
                if rd[j] & matchmask:
                    score = match_bitapScore(d, j - 1)
                    # This match will almost certainly be better than any existing match.
                    # But check anyway.
                    if score <= score_threshold:
                        # Told you so.
                        score_threshold = score
                        best_loc = j - 1
                        if best_loc > loc:
                            # When passing loc, don't exceed our current
                            # distance from loc.
                            start = max(1, 2 * loc - best_loc)
                        else:
                            # Already passed loc, downhill from here on in.
                            break
            # No hope for a (better) match at greater error levels.
            if match_bitapScore(d + 1, loc) > score_threshold:
                break
            last_rd = rd
        return best_loc
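
    # Worked example of the scoring closure above, using the defaults
    # (Match_Distance = 1000, Match_Threshold = 0.5): a candidate with e errors
    # at position x scores e / len(pattern) + abs(loc - x) / 1000.0, so for a
    # 4-character pattern one error found 10 characters from loc scores
    # 0.25 + 0.01 = 0.26, which is under the 0.5 threshold and is accepted.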

    def match_alphabet(self, pattern):
        """Initialise the alphabet for the Bitap algorithm.

        Args:
          pattern: The text to encode.

        Returns:
          Hash of character locations.

        """
        s = {}
        for char in pattern:
            s[char] = 0
        for i in xrange(len(pattern)):
            s[pattern[i]] |= 1 << (len(pattern) - i - 1)
        return s
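
    # For example, match_alphabet('abc') returns {'a': 4, 'b': 2, 'c': 1}: each
    # character maps to a bit mask of the positions where it occurs, with the
    # most significant bit standing for the first character of the pattern.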

    #  PATCH FUNCTIONS

    def patch_addContext(self, patch, text):
        """Increase the context until it is unique, but don't let the pattern
        expand beyond Match_MaxBits.

        Args:
          patch: The patch to grow.
          text: Source text.

        """
        if len(text) == 0:
            return
        pattern = text[patch.start2: patch.start2 + patch.length1]
        padding = 0

        # Look for the first and last matches of pattern in text.  If two different
        # matches are found, increase the pattern length.
        while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
                                                              0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
                                                              self.Patch_Margin)):
            padding += self.Patch_Margin
            pattern = text[max(0, patch.start2 - padding):
                           patch.start2 + patch.length1 + padding]
        # Add one chunk for good luck.
        padding += self.Patch_Margin

        # Add the prefix.
        prefix = text[max(0, patch.start2 - padding): patch.start2]
        if prefix:
            patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
        # Add the suffix.
        suffix = text[patch.start2 + patch.length1:
                      patch.start2 + patch.length1 + padding]
        if suffix:
            patch.diffs.append((self.DIFF_EQUAL, suffix))

        # Roll back the start points.
        patch.start1 -= len(prefix)
        patch.start2 -= len(prefix)
        # Extend lengths.
        patch.length1 += len(prefix) + len(suffix)
        patch.length2 += len(prefix) + len(suffix)

    def patch_make(self, a, b=None, c=None):
        """Compute a list of patches to turn text1 into text2.
        Use diffs if provided, otherwise compute it ourselves.
        There are four ways to call this function, depending on what data is
        available to the caller:
        Method 1:
        a = text1, b = text2
        Method 2:
        a = diffs
        Method 3 (optimal):
        a = text1, b = diffs
        Method 4 (deprecated, use method 3):
        a = text1, b = text2, c = diffs

        Args:
          a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
              text2 (method 2).
          b: text2 (methods 1,4) or Array of diff tuples for text1 to
              text2 (method 3) or undefined (method 2).
          c: Array of diff tuples for text1 to text2 (method 4) or
              undefined (methods 1,2,3).

        Returns:
          Array of patch objects.
        """
        text1 = None
        diffs = None
        # Note that texts may arrive as 'str' or 'unicode'.
        if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
            # Method 1: text1, text2
            # Compute diffs from text1 and text2.
            text1 = a
            diffs = self.diff_main(text1, b, True)
            if len(diffs) > 2:
                self.diff_cleanupSemantic(diffs)
                self.diff_cleanupEfficiency(diffs)
        elif isinstance(a, list) and b is None and c is None:
            # Method 2: diffs
            # Compute text1 from diffs.
            diffs = a
            text1 = self.diff_text1(diffs)
        elif isinstance(a, basestring) and isinstance(b, list) and c is None:
            # Method 3: text1, diffs
            text1 = a
            diffs = b
        elif (isinstance(a, basestring) and isinstance(b, basestring) and
              isinstance(c, list)):
            # Method 4: text1, text2, diffs
            # text2 is not used.
            text1 = a
            diffs = c
        else:
            raise ValueError('Unknown call format to patch_make.')

        if not diffs:
            return []  # Get rid of the None case.
        patches = []
        patch = patch_obj()
        char_count1 = 0  # Number of characters into the text1 string.
        char_count2 = 0  # Number of characters into the text2 string.
        # Recreate the patches to determine context info.
        prepatch_text = text1
        postpatch_text = text1
        for x in xrange(len(diffs)):
            (diff_type, diff_text) = diffs[x]
            if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
                # A new patch starts here.
                patch.start1 = char_count1
                patch.start2 = char_count2
            if diff_type == self.DIFF_INSERT:
                # Insertion
                patch.diffs.append(diffs[x])
                patch.length2 += len(diff_text)
                postpatch_text = (postpatch_text[:char_count2] + diff_text +
                                  postpatch_text[char_count2:])
            elif diff_type == self.DIFF_DELETE:
                # Deletion.
                patch.length1 += len(diff_text)
                patch.diffs.append(diffs[x])
                postpatch_text = (postpatch_text[:char_count2] +
                                  postpatch_text[char_count2 + len(diff_text):])
            elif (diff_type == self.DIFF_EQUAL and
                  len(diff_text) <= 2 * self.Patch_Margin and
                  len(patch.diffs) != 0 and len(diffs) != x + 1):
                # Small equality inside a patch.
                patch.diffs.append(diffs[x])
                patch.length1 += len(diff_text)
                patch.length2 += len(diff_text)

            if (diff_type == self.DIFF_EQUAL and
                    len(diff_text) >= 2 * self.Patch_Margin):
                # Time for a new patch.
                if len(patch.diffs) != 0:
                    self.patch_addContext(patch, prepatch_text)
                    patches.append(patch)
                    patch = patch_obj()
                    # Unlike Unidiff, our patch lists have a rolling context.
                    # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
                    # Update prepatch text & pos to reflect the application of the
                    # just completed patch.
                    prepatch_text = postpatch_text
                    char_count1 = char_count2

            # Update the current character count.
            if diff_type != self.DIFF_INSERT:
                char_count1 += len(diff_text)
            if diff_type != self.DIFF_DELETE:
                char_count2 += len(diff_text)

        # Pick up the leftover patch if not empty.
        if len(patch.diffs) != 0:
            self.patch_addContext(patch, prepatch_text)
            patches.append(patch)
        return patches
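
    # A minimal usage sketch (assumes a diff_match_patch instance named dmp):
    #   patches = dmp.patch_make('The quick brown fox.', 'The slow brown fox.')
    # returns a list of patch_obj instances (method 1 above); the same list can
    # also be built from precomputed diffs via dmp.patch_make(text1, diffs).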

    def patch_deepCopy(self, patches):
        """Given an array of patches, return another array that is identical.

        Args:
          patches: Array of patch objects.

        Returns:
          Array of patch objects.

        """
        patchesCopy = []
        for patch in patches:
            patchCopy = patch_obj()
            # No need to deep copy the tuples since they are immutable.
            patchCopy.diffs = patch.diffs[:]
            patchCopy.start1 = patch.start1
            patchCopy.start2 = patch.start2
            patchCopy.length1 = patch.length1
            patchCopy.length2 = patch.length2
            patchesCopy.append(patchCopy)
        return patchesCopy

    def patch_apply(self, patches, text):
        """Merge a set of patches onto the text.  Return a patched text, as
        well as a list of true/false values indicating which patches were
        applied.

        Args:
          patches: Array of patch objects.
          text: Old text.

        Returns:
          Two element Array, containing the new text and an array of boolean values.

        """
        if not patches:
            return (text, [])

        # Deep copy the patches so that no changes are made to originals.
        patches = self.patch_deepCopy(patches)

        nullPadding = self.patch_addPadding(patches)
        text = nullPadding + text + nullPadding
        self.patch_splitMax(patches)

        # delta keeps track of the offset between the expected and actual location
        # of the previous patch.  If there are patches expected at positions 10 and
        # 20, but the first patch was found at 12, delta is 2 and the second patch
        # has an effective expected position of 22.
        delta = 0
        results = []
        for patch in patches:
            expected_loc = patch.start2 + delta
            text1 = self.diff_text1(patch.diffs)
            end_loc = -1
            if len(text1) > self.Match_MaxBits:
                # patch_splitMax will only provide an oversized pattern in the case of
                # a monster delete.
                start_loc = self.match_main(text, text1[:self.Match_MaxBits],
                                            expected_loc)
                if start_loc != -1:
                    end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
                                              expected_loc + len(text1) - self.Match_MaxBits)
                    if end_loc == -1 or start_loc >= end_loc:
                        # Can't find valid trailing context.  Drop this patch.
                        start_loc = -1
            else:
                start_loc = self.match_main(text, text1, expected_loc)
            if start_loc == -1:
                # No match found.  :(
                results.append(False)
                # Subtract the delta for this failed patch from subsequent
                # patches.
                delta -= patch.length2 - patch.length1
            else:
                # Found a match.  :)
                results.append(True)
                delta = start_loc - expected_loc
                if end_loc == -1:
                    text2 = text[start_loc: start_loc + len(text1)]
                else:
                    text2 = text[start_loc: end_loc + self.Match_MaxBits]
                if text1 == text2:
                    # Perfect match, just shove the replacement text in.
                    text = (text[:start_loc] + self.diff_text2(patch.diffs) +
                            text[start_loc + len(text1):])
                else:
                    # Imperfect match.
                    # Run a diff to get a framework of equivalent indices.
                    diffs = self.diff_main(text1, text2, False)
                    if (len(text1) > self.Match_MaxBits and
                            self.diff_levenshtein(diffs) / float(len(text1)) >
                            self.Patch_DeleteThreshold):
                        # The end points match, but the content is unacceptably
                        # bad.
                        results[-1] = False
                    else:
                        self.diff_cleanupSemanticLossless(diffs)
                        index1 = 0
                        for (op, data) in patch.diffs:
                            if op != self.DIFF_EQUAL:
                                index2 = self.diff_xIndex(diffs, index1)
                            if op == self.DIFF_INSERT:  # Insertion
                                text = text[:start_loc + index2] + data + text[start_loc +
                                                                               index2:]
                            elif op == self.DIFF_DELETE:  # Deletion
                                text = text[:start_loc + index2] + text[start_loc +
                                                                        self.diff_xIndex(diffs, index1 + len(data)):]
                            if op != self.DIFF_DELETE:
                                index1 += len(data)
        # Strip the padding off.
        text = text[len(nullPadding):-len(nullPadding)]
        return (text, results)
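
    # A minimal usage sketch, continuing the patch_make example above:
    #   new_text, results = dmp.patch_apply(patches, 'The quick brown fox.')
    # results holds one boolean per patch; a False entry means that patch could
    # not be matched closely enough in the supplied text and was skipped.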

    def patch_addPadding(self, patches):
        """Add some padding on text start and end so that edges can match
        something.  Intended to be called only from within patch_apply.

        Args:
          patches: Array of patch objects.

        Returns:
          The padding string added to each side.

        """
        paddingLength = self.Patch_Margin
        nullPadding = ''
        for x in xrange(1, paddingLength + 1):
            nullPadding += chr(x)

        # Bump all the patches forward.
        for patch in patches:
            patch.start1 += paddingLength
            patch.start2 += paddingLength

        # Add some padding on start of first diff.
        patch = patches[0]
        diffs = patch.diffs
        if not diffs or diffs[0][0] != self.DIFF_EQUAL:
            # Add nullPadding equality.
            diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
            patch.start1 -= paddingLength  # Should be 0.
            patch.start2 -= paddingLength  # Should be 0.
            patch.length1 += paddingLength
            patch.length2 += paddingLength
        elif paddingLength > len(diffs[0][1]):
            # Grow first equality.
            extraLength = paddingLength - len(diffs[0][1])
            newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
            diffs[0] = (diffs[0][0], newText)
            patch.start1 -= extraLength
            patch.start2 -= extraLength
            patch.length1 += extraLength
            patch.length2 += extraLength

        # Add some padding on end of last diff.
        patch = patches[-1]
        diffs = patch.diffs
        if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
            # Add nullPadding equality.
            diffs.append((self.DIFF_EQUAL, nullPadding))
            patch.length1 += paddingLength
            patch.length2 += paddingLength
        elif paddingLength > len(diffs[-1][1]):
            # Grow last equality.
            extraLength = paddingLength - len(diffs[-1][1])
            newText = diffs[-1][1] + nullPadding[:extraLength]
            diffs[-1] = (diffs[-1][0], newText)
            patch.length1 += extraLength
            patch.length2 += extraLength

        return nullPadding
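
    # With the default Patch_Margin of 4 the padding string is the four control
    # characters '\x01\x02\x03\x04', which are unlikely to occur in real text
    # and are stripped off again at the end of patch_apply.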

    def patch_splitMax(self, patches):
        """Look through the patches and break up any which are longer than the
        maximum limit of the match algorithm.

        Args:
          patches: Array of patch objects.

        """
        if self.Match_MaxBits == 0:
            return
        for x in xrange(len(patches)):
            if patches[x].length1 > self.Match_MaxBits:
                bigpatch = patches[x]
                # Remove the big old patch.
                del patches[x]
                x -= 1
                patch_size = self.Match_MaxBits
                start1 = bigpatch.start1
                start2 = bigpatch.start2
                precontext = ''
                while len(bigpatch.diffs) != 0:
                    # Create one of several smaller patches.
                    patch = patch_obj()
                    empty = True
                    patch.start1 = start1 - len(precontext)
                    patch.start2 = start2 - len(precontext)
                    if precontext:
                        patch.length1 = patch.length2 = len(precontext)
                        patch.diffs.append((self.DIFF_EQUAL, precontext))

                    while (len(bigpatch.diffs) != 0 and
                           patch.length1 < patch_size - self.Patch_Margin):
                        (diff_type, diff_text) = bigpatch.diffs[0]
                        if diff_type == self.DIFF_INSERT:
                            # Insertions are harmless.
                            patch.length2 += len(diff_text)
                            start2 += len(diff_text)
                            patch.diffs.append(bigpatch.diffs.pop(0))
                            empty = False
                        elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
                              patch.diffs[0][0] == self.DIFF_EQUAL and
                              len(diff_text) > 2 * patch_size):
                            # This is a large deletion.  Let it pass in one
                            # chunk.
                            patch.length1 += len(diff_text)
                            start1 += len(diff_text)
                            empty = False
                            patch.diffs.append((diff_type, diff_text))
                            del bigpatch.diffs[0]
                        else:
                            # Deletion or equality.  Only take as much as we
                            # can stomach.
                            diff_text = diff_text[:patch_size - patch.length1 -
                                                  self.Patch_Margin]
                            patch.length1 += len(diff_text)
                            start1 += len(diff_text)
                            if diff_type == self.DIFF_EQUAL:
                                patch.length2 += len(diff_text)
                                start2 += len(diff_text)
                            else:
                                empty = False

                            patch.diffs.append((diff_type, diff_text))
                            if diff_text == bigpatch.diffs[0][1]:
                                del bigpatch.diffs[0]
                            else:
                                bigpatch.diffs[0] = (bigpatch.diffs[0][0],
                                                     bigpatch.diffs[0][1][len(diff_text):])

                    # Compute the head context for the next patch.
                    precontext = self.diff_text2(patch.diffs)
                    precontext = precontext[-self.Patch_Margin:]
                    # Append the end context for this patch.
                    postcontext = self.diff_text1(bigpatch.diffs)[
                        :self.Patch_Margin]
                    if postcontext:
                        patch.length1 += len(postcontext)
                        patch.length2 += len(postcontext)
                        if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
                            patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
                                               postcontext)
                        else:
                            patch.diffs.append((self.DIFF_EQUAL, postcontext))

                    if not empty:
                        x += 1
                        patches.insert(x, patch)

    def patch_toText(self, patches):
        """Take a list of patches and return a textual representation.

        Args:
          patches: Array of patch objects.

        Returns:
          Text representation of patches.

        """
        text = []
        for patch in patches:
            text.append(str(patch))
        return ''.join(text)

    def patch_fromText(self, textline):
        """Parse a textual representation of patches and return a list of patch
        objects.

        Args:
          textline: Text representation of patches.

        Returns:
          Array of patch objects.

        Raises:
          ValueError: If invalid input.

        """
        if type(textline) == unicode:
            # Patches should be composed of a subset of ascii chars, Unicode not
            # required.  If this encode raises UnicodeEncodeError, patch is
            # invalid.
            textline = textline.encode('ascii')
        patches = []
        if not textline:
            return patches
        text = textline.split('\n')
        while len(text) != 0:
            m = re.match('^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$', text[0])
            if not m:
                raise ValueError, 'Invalid patch string: ' + text[0]
            patch = patch_obj()
            patches.append(patch)
            patch.start1 = int(m.group(1))
            if m.group(2) == '':
                patch.start1 -= 1
                patch.length1 = 1
            elif m.group(2) == '0':
                patch.length1 = 0
            else:
                patch.start1 -= 1
                patch.length1 = int(m.group(2))

            patch.start2 = int(m.group(3))
            if m.group(4) == '':
                patch.start2 -= 1
                patch.length2 = 1
            elif m.group(4) == '0':
                patch.length2 = 0
            else:
                patch.start2 -= 1
                patch.length2 = int(m.group(4))

            del text[0]

            while len(text) != 0:
                if text[0]:
                    sign = text[0][0]
                else:
                    sign = ''
                line = urllib.unquote(text[0][1:])
                line = line.decode('utf-8')
                if sign == '+':
                    # Insertion.
                    patch.diffs.append((self.DIFF_INSERT, line))
                elif sign == '-':
                    # Deletion.
                    patch.diffs.append((self.DIFF_DELETE, line))
                elif sign == ' ':
                    # Minor equality.
                    patch.diffs.append((self.DIFF_EQUAL, line))
                elif sign == '@':
                    # Start of next patch.
                    break
                elif sign == '':
                    # Blank line?  Whatever.
                    pass
                else:
                    # WTF?
                    raise ValueError, "Invalid patch mode: '%s'\n%s" % (sign, line)
                del text[0]
        return patches
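
    # patch_toText and patch_fromText are inverses for text produced by
    # patch_toText, so a serialized patch list can be stored or transmitted and
    # rebuilt later (a sketch, with patch_text standing in for such a string):
    #   dmp.patch_fromText(dmp.patch_toText(patches))  # equivalent patch list
    #   dmp.patch_toText(dmp.patch_fromText(patch_text)) == patch_text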


class patch_obj:
    """Class representing one patch operation."""

    def __init__(self):
        """Initializes with an empty list of diffs."""
        self.diffs = []
        self.start1 = None
        self.start2 = None
        self.length1 = 0
        self.length2 = 0

    def __str__(self):
        """Emulate GNU diff's format.
        Header: @@ -382,8 +481,9 @@
        Indices are printed as 1-based, not 0-based.

        Returns:
          The GNU diff string.
        """
        if self.length1 == 0:
            coords1 = str(self.start1) + ',0'
        elif self.length1 == 1:
            coords1 = str(self.start1 + 1)
        else:
            coords1 = str(self.start1 + 1) + ',' + str(self.length1)
        if self.length2 == 0:
            coords2 = str(self.start2) + ',0'
        elif self.length2 == 1:
            coords2 = str(self.start2 + 1)
        else:
            coords2 = str(self.start2 + 1) + ',' + str(self.length2)
        text = ['@@ -', coords1, ' +', coords2, ' @@\n']
        # Escape the body of the patch with %xx notation.
        for (op, data) in self.diffs:
            if op == diff_match_patch.DIFF_INSERT:
                text.append('+')
            elif op == diff_match_patch.DIFF_DELETE:
                text.append('-')
            elif op == diff_match_patch.DIFF_EQUAL:
                text.append(' ')
            # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
            data = data.encode('utf-8')
            text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + '\n')
        return ''.join(text)
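

# End-to-end sketch (the module alias dmp_module is illustrative):
#   import diff_match_patch as dmp_module
#   dmp = dmp_module.diff_match_patch()
#   diffs = dmp.diff_main('Hello world.', 'Goodbye world.')
#   patches = dmp.patch_make('Hello world.', diffs)
#   new_text, results = dmp.patch_apply(patches, 'Hello world.')
#   # new_text == 'Goodbye world.' and all(results) when the source text matches.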