~widelands-dev/widelands-website/move_minimaps

173.2.2 by Holger Rapp
Added diff_match_patch from google, because they do not provide a setup.py
1
#!/usr/bin/python2.4
2
3
# Taken from google because they do not provide a setup.py file. Thanks anyway
4
5
"""Diff Match and Patch
6
7
Copyright 2006 Google Inc.
8
http://code.google.com/p/google-diff-match-patch/
9
10
Licensed under the Apache License, Version 2.0 (the "License");
11
you may not use this file except in compliance with the License.
12
You may obtain a copy of the License at
13
14
  http://www.apache.org/licenses/LICENSE-2.0
15
16
Unless required by applicable law or agreed to in writing, software
17
distributed under the License is distributed on an "AS IS" BASIS,
18
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
See the License for the specific language governing permissions and
20
limitations under the License.
21
"""
22
23
"""Functions for diff, match and patch.
24
25
Computes the difference between two texts to create a patch.
26
Applies the patch onto another text, allowing for errors.
27
"""
28
29
__author__ = 'fraser@google.com (Neil Fraser)'
30
31
import math
32
import time
33
import urllib
34
import re
35
36
class diff_match_patch:
37
  """Class containing the diff, match and patch methods.
38
39
  Also contains the behaviour settings.
40
  """
41
42
  def __init__(self):
    """Inits a diff_match_patch object with default settings.
    Redefine these in your program to override the defaults.
    """

    # Number of seconds to map a diff before giving up (0 for infinity).
    self.Diff_Timeout = 1.0
    # Cost of an empty edit operation in terms of edit characters.
    self.Diff_EditCost = 4
    # The size beyond which the double-ended diff activates.
    # Double-ending is twice as fast, but less accurate.
    self.Diff_DualThreshold = 32
    # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
    self.Match_Threshold = 0.5
    # How far to search for a match (0 = exact location, 1000+ = broad match).
    # A match this many characters away from the expected location will add
    # 1.0 to the score (0.0 is a perfect match).
    self.Match_Distance = 1000
    # When deleting a large block of text (over ~64 characters), how close does
    # the contents have to match the expected contents. (0.0 = perfection,
    # 1.0 = very loose).  Note that Match_Threshold controls how closely the
    # end points of a delete need to match.
    self.Patch_DeleteThreshold = 0.5
    # Chunk size for context length.
    self.Patch_Margin = 4

    # How many bits in a number?
    # Python has no maximum, thus to disable patch splitting set to 0.
    # However to avoid long patches in certain pathological cases, use 32.
    # Multiple short patches (using native ints) are much faster than long ones.
    self.Match_MaxBits = 32
  #  DIFF FUNCTIONS

  # The data structure representing a diff is an array of tuples:
  # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
  # which means: delete "Hello", add "Goodbye" and keep " world."
  DIFF_DELETE = -1
  DIFF_INSERT = 1
  DIFF_EQUAL = 0
  def diff_main(self, text1, text2, checklines=True):
    """Find the differences between two texts.  Simplifies the problem by
      stripping any common prefix or suffix off the texts before diffing.

    Args:
      text1: Old string to be diffed.
      text2: New string to be diffed.
      checklines: Optional speedup flag.  If present and false, then don't run
        a line-level diff first to identify the changed areas.
        Defaults to true, which does a faster, slightly less optimal diff.

    Returns:
      Array of changes.
    """

    # Check for equality (speedup)
    if text1 == text2:
      return [(self.DIFF_EQUAL, text1)]

    # Trim off common prefix (speedup)
    commonlength = self.diff_commonPrefix(text1, text2)
    commonprefix = text1[:commonlength]
    text1 = text1[commonlength:]
    text2 = text2[commonlength:]

    # Trim off common suffix (speedup)
    commonlength = self.diff_commonSuffix(text1, text2)
    if commonlength == 0:
      commonsuffix = ''
    else:
      # Guard against commonlength == 0: text1[:-0] would be '' not text1.
      commonsuffix = text1[-commonlength:]
      text1 = text1[:-commonlength]
      text2 = text2[:-commonlength]

    # Compute the diff on the middle block
    diffs = self.diff_compute(text1, text2, checklines)

    # Restore the prefix and suffix
    if commonprefix:
      diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
    if commonsuffix:
      diffs.append((self.DIFF_EQUAL, commonsuffix))
    self.diff_cleanupMerge(diffs)
    return diffs
  def diff_compute(self, text1, text2, checklines):
    """Find the differences between two texts.  Assumes that the texts do not
      have any common prefix or suffix.

    Args:
      text1: Old string to be diffed.
      text2: New string to be diffed.
      checklines: Speedup flag.  If false, then don't run a line-level diff
        first to identify the changed areas.
        If true, then run a faster, slightly less optimal diff.

    Returns:
      Array of changes.
    """
    if not text1:
      # Just add some text (speedup)
      return [(self.DIFF_INSERT, text2)]

    if not text2:
      # Just delete some text (speedup)
      return [(self.DIFF_DELETE, text1)]

    if len(text1) > len(text2):
      (longtext, shorttext) = (text1, text2)
    else:
      (shorttext, longtext) = (text1, text2)
    i = longtext.find(shorttext)
    if i != -1:
      # Shorter text is inside the longer text (speedup)
      diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
               (self.DIFF_INSERT, longtext[i + len(shorttext):])]
      # Swap insertions for deletions if diff is reversed.
      if len(text1) > len(text2):
        diffs[0] = (self.DIFF_DELETE, diffs[0][1])
        diffs[2] = (self.DIFF_DELETE, diffs[2][1])
      return diffs
    longtext = shorttext = None  # Garbage collect.

    # Check to see if the problem can be split in two.
    hm = self.diff_halfMatch(text1, text2)
    if hm:
      # A half-match was found, sort out the return data.
      (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
      # Send both pairs off for separate processing.
      diffs_a = self.diff_main(text1_a, text2_a, checklines)
      diffs_b = self.diff_main(text1_b, text2_b, checklines)
      # Merge the results.
      return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b

    # Perform a real diff.
    if checklines and (len(text1) < 100 or len(text2) < 100):
      checklines = False  # Too trivial for the overhead.
    if checklines:
      # Scan the text on a line-by-line basis first.
      (text1, text2, linearray) = self.diff_linesToChars(text1, text2)

    diffs = self.diff_map(text1, text2)
    if not diffs:  # No acceptable result.
      diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
    if checklines:
      # Convert the diff back to original text.
      self.diff_charsToLines(diffs, linearray)
      # Eliminate freak matches (e.g. blank lines)
      self.diff_cleanupSemantic(diffs)

      # Rediff any replacement blocks, this time character-by-character.
      # Add a dummy entry at the end.
      diffs.append((self.DIFF_EQUAL, ''))
      pointer = 0
      count_delete = 0
      count_insert = 0
      text_delete = ''
      text_insert = ''
      while pointer < len(diffs):
        if diffs[pointer][0] == self.DIFF_INSERT:
          count_insert += 1
          text_insert += diffs[pointer][1]
        elif diffs[pointer][0] == self.DIFF_DELETE:
          count_delete += 1
          text_delete += diffs[pointer][1]
        elif diffs[pointer][0] == self.DIFF_EQUAL:
          # Upon reaching an equality, check for prior redundancies.
          if count_delete >= 1 and count_insert >= 1:
            # Delete the offending records and add the merged ones.
            # Re-diff the accumulated delete/insert run character-by-character
            # (checklines=False) and splice the finer diff in place.
            a = self.diff_main(text_delete, text_insert, False)
            diffs[pointer - count_delete - count_insert : pointer] = a
            pointer = pointer - count_delete - count_insert + len(a)
          count_insert = 0
          count_delete = 0
          text_delete = ''
          text_insert = ''

        pointer += 1

      diffs.pop()  # Remove the dummy entry at the end.
    return diffs
  def diff_linesToChars(self, text1, text2):
    """Split two texts into an array of strings.  Reduce the texts to a string
    of hashes where each Unicode character represents one line.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      Three element tuple, containing the encoded text1, the encoded text2 and
      the array of unique strings.  The zeroth element of the array of unique
      strings is intentionally blank.
    """
    lineArray = []  # e.g. lineArray[4] == "Hello\n"
    lineHash = {}   # e.g. lineHash["Hello\n"] == 4

    # "\x00" is a valid character, but various debuggers don't like it.
    # So we'll insert a junk entry to avoid generating a null character.
    lineArray.append('')

    def diff_linesToCharsMunge(text):
      """Split a text into an array of strings.  Reduce the texts to a string
      of hashes where each Unicode character represents one line.
      Modifies linearray and linehash through being a closure.

      Args:
        text: String to encode.

      Returns:
        Encoded string.
      """
      chars = []
      # Walk the text, pulling out a substring for each line.
      # text.split('\n') would temporarily double our memory footprint.
      # Modifying text would create many large strings to garbage collect.
      lineStart = 0
      lineEnd = -1
      while lineEnd < len(text) - 1:
        lineEnd = text.find('\n', lineStart)
        if lineEnd == -1:
          # Last line may have no trailing newline; take it to end of text.
          lineEnd = len(text) - 1
        line = text[lineStart:lineEnd + 1]
        lineStart = lineEnd + 1

        if line in lineHash:
          chars.append(unichr(lineHash[line]))
        else:
          lineArray.append(line)
          lineHash[line] = len(lineArray) - 1
          # NOTE(review): unichr is Python 2 only (chr in Python 3); on narrow
          # Unicode builds this presumably caps distinct lines at 65535 —
          # confirm if very large texts are expected.
          chars.append(unichr(len(lineArray) - 1))
      return "".join(chars)

    chars1 = diff_linesToCharsMunge(text1)
    chars2 = diff_linesToCharsMunge(text2)
    return (chars1, chars2, lineArray)
  def diff_charsToLines(self, diffs, lineArray):
    """Rehydrate the text in a diff from a string of line hashes to real lines
    of text.

    Mutates diffs in place; each tuple's text is rebuilt by mapping every
    character code back to its original line via lineArray.

    Args:
      diffs: Array of diff tuples.
      lineArray: Array of unique strings.
    """
    for index, (op, encoded) in enumerate(diffs):
      decoded = "".join([lineArray[ord(ch)] for ch in encoded])
      diffs[index] = (op, decoded)
  def diff_map(self, text1, text2):
    """Explore the intersection points between the two texts.

    Walks a forward edit path and (for large inputs, when
    Diff_DualThreshold * 2 < max_d) a simultaneous reverse path; when the two
    paths meet, the problem is split and each half is reconstructed via
    diff_path1/diff_path2.

    Args:
      text1: Old string to be diffed.
      text2: New string to be diffed.

    Returns:
      Array of diff tuples or None if no diff available.
    """

    # Unlike in most languages, Python counts time in seconds.
    s_end = time.time() + self.Diff_Timeout  # Don't run for too long.
    # Cache the text lengths to prevent multiple calls.
    text1_length = len(text1)
    text2_length = len(text2)
    max_d = text1_length + text2_length - 1
    doubleEnd = self.Diff_DualThreshold * 2 < max_d
    v_map1 = []  # Snapshot of visited (x, y) points per step, forward path.
    v_map2 = []  # Same for the reverse path.
    v1 = {}  # Furthest-reaching x per diagonal k, forward path.
    v2 = {}  # Same for the reverse path.
    v1[1] = 0
    v2[1] = 0
    footsteps = {}  # Maps (x, y) -> step d at which the point was reached.
    done = False
    # If the total number of characters is odd, then the front path will
    # collide with the reverse path.
    front = (text1_length + text2_length) % 2
    for d in xrange(max_d):
      # Bail out if timeout reached.
      if self.Diff_Timeout > 0 and time.time() > s_end:
        return None

      # Walk the front path one step.
      v_map1.append({})
      for k in xrange(-d, d + 1, 2):
        if k == -d or k != d and v1[k - 1] < v1[k + 1]:
          x = v1[k + 1]
        else:
          x = v1[k - 1] + 1
        y = x - k
        if doubleEnd:
          footstep = (x, y)
          if front and footstep in footsteps:
            done = True
          if not front:
            footsteps[footstep] = d

        # Follow the diagonal (matching characters) as far as possible.
        while (not done and x < text1_length and y < text2_length and
               text1[x] == text2[y]):
          x += 1
          y += 1
          if doubleEnd:
            footstep = (x, y)
            if front and footstep in footsteps:
              done = True
            if not front:
              footsteps[footstep] = d

        v1[k] = x
        v_map1[d][(x, y)] = True
        if x == text1_length and y == text2_length:
          # Reached the end in single-path mode.
          return self.diff_path1(v_map1, text1, text2)
        elif done:
          # Front path ran over reverse path.
          v_map2 = v_map2[:footsteps[footstep] + 1]
          a = self.diff_path1(v_map1, text1[:x], text2[:y])
          b = self.diff_path2(v_map2, text1[x:], text2[y:])
          return a + b

      if doubleEnd:
        # Walk the reverse path one step.
        v_map2.append({})
        for k in xrange(-d, d + 1, 2):
          if k == -d or k != d and v2[k - 1] < v2[k + 1]:
            x = v2[k + 1]
          else:
            x = v2[k - 1] + 1
          y = x - k
          # Reverse-path coordinates are measured from the ends of the texts.
          footstep = (text1_length - x, text2_length - y)
          if not front and footstep in footsteps:
            done = True
          if front:
            footsteps[footstep] = d
          while (not done and x < text1_length and y < text2_length and
                 text1[-x - 1] == text2[-y - 1]):
            x += 1
            y += 1
            footstep = (text1_length - x, text2_length - y)
            if not front and footstep in footsteps:
              done = True
            if front:
              footsteps[footstep] = d

          v2[k] = x
          v_map2[d][(x, y)] = True
          if done:
            # Reverse path ran over front path.
            v_map1 = v_map1[:footsteps[footstep] + 1]
            a = self.diff_path1(v_map1, text1[:text1_length - x],
                                text2[:text2_length - y])
            b = self.diff_path2(v_map2, text1[text1_length - x:],
                                text2[text2_length - y:])
            return a + b

    # Number of diffs equals number of characters, no commonality at all.
    return None
  def diff_path1(self, v_map, text1, text2):
    """Work from the middle back to the start to determine the path.

    Replays the forward-path snapshots in reverse, prepending one diff tuple
    per step and coalescing consecutive operations of the same kind.

    Args:
      v_map: Array of paths.
      text1: Old string fragment to be diffed.
      text2: New string fragment to be diffed.

    Returns:
      Array of diff tuples.
    """
    path = []
    x = len(text1)
    y = len(text2)
    last_op = None  # Operation of path[0], used to merge adjacent runs.
    for d in xrange(len(v_map) - 2, -1, -1):
      while True:
        if (x - 1, y) in v_map[d]:
          # Horizontal move: a character deleted from text1.
          x -= 1
          if last_op == self.DIFF_DELETE:
            path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
          else:
            path[:0] = [(self.DIFF_DELETE, text1[x])]
          last_op = self.DIFF_DELETE
          break
        elif (x, y - 1) in v_map[d]:
          # Vertical move: a character inserted from text2.
          y -= 1
          if last_op == self.DIFF_INSERT:
            path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
          else:
            path[:0] = [(self.DIFF_INSERT, text2[y])]
          last_op = self.DIFF_INSERT
          break
        else:
          # Diagonal move: characters match in both texts.
          x -= 1
          y -= 1
          assert text1[x] == text2[y], ("No diagonal.  " +
              "Can't happen. (diff_path1)")
          if last_op == self.DIFF_EQUAL:
            path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
          else:
            path[:0] = [(self.DIFF_EQUAL, text1[x])]
          last_op = self.DIFF_EQUAL
    return path
  def diff_path2(self, v_map, text1, text2):
    """Work from the middle back to the end to determine the path.

    Mirror image of diff_path1: replays the reverse-path snapshots, appending
    tuples (indexing the texts from their ends) instead of prepending.

    Args:
      v_map: Array of paths.
      text1: Old string fragment to be diffed.
      text2: New string fragment to be diffed.

    Returns:
      Array of diff tuples.
    """
    path = []
    x = len(text1)
    y = len(text2)
    last_op = None  # Operation of path[-1], used to merge adjacent runs.
    for d in xrange(len(v_map) - 2, -1, -1):
      while True:
        if (x - 1, y) in v_map[d]:
          # Horizontal move: a character deleted from text1.
          x -= 1
          if last_op == self.DIFF_DELETE:
            path[-1] = (self.DIFF_DELETE, path[-1][1] + text1[-x - 1])
          else:
            path.append((self.DIFF_DELETE, text1[-x - 1]))
          last_op = self.DIFF_DELETE
          break
        elif (x, y - 1) in v_map[d]:
          # Vertical move: a character inserted from text2.
          y -= 1
          if last_op == self.DIFF_INSERT:
            path[-1] = (self.DIFF_INSERT, path[-1][1] + text2[-y - 1])
          else:
            path.append((self.DIFF_INSERT, text2[-y - 1]))
          last_op = self.DIFF_INSERT
          break
        else:
          # Diagonal move: characters match in both texts.
          x -= 1
          y -= 1
          assert text1[-x - 1] == text2[-y - 1], ("No diagonal.  " +
              "Can't happen. (diff_path2)")
          if last_op == self.DIFF_EQUAL:
            path[-1] = (self.DIFF_EQUAL, path[-1][1] + text1[-x - 1])
          else:
            path.append((self.DIFF_EQUAL, text1[-x - 1]))
          last_op = self.DIFF_EQUAL
    return path
  def diff_commonPrefix(self, text1, text2):
    """Determine the common prefix of two strings.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      The number of characters common to the start of each string.
    """
    # Cheap rejection: empty input or differing first characters.
    if not text1 or not text2 or text1[0] != text2[0]:
      return 0
    # Binary search for the length of the shared prefix.
    # Performance analysis: http://neil.fraser.name/news/2007/10/09/
    lo = 0
    hi = min(len(text1), len(text2))
    mid = hi
    start = 0
    while lo < mid:
      if text1[start:mid] == text2[start:mid]:
        # Everything up to mid matches; only compare beyond it next time.
        lo = mid
        start = lo
      else:
        hi = mid
      mid = int((hi - lo) / 2 + lo)
    return mid
  def diff_commonSuffix(self, text1, text2):
    """Determine the common suffix of two strings.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      The number of characters common to the end of each string.
    """
    # Cheap rejection: empty input or differing last characters.
    if not text1 or not text2 or text1[-1] != text2[-1]:
      return 0
    # Binary search for the length of the shared suffix.
    # Performance analysis: http://neil.fraser.name/news/2007/10/09/
    lo = 0
    hi = min(len(text1), len(text2))
    mid = hi
    end = 0
    while lo < mid:
      if (text1[-mid:len(text1) - end] ==
          text2[-mid:len(text2) - end]):
        # The trailing mid characters match; skip them next iteration.
        lo = mid
        end = lo
      else:
        hi = mid
      mid = int((hi - lo) / 2 + lo)
    return mid
  def diff_halfMatch(self, text1, text2):
    """Do the two texts share a substring which is at least half the length of
    the longer text?

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      Five element Array, containing the prefix of text1, the suffix of text1,
      the prefix of text2, the suffix of text2 and the common middle.  Or None
      if there was no match.
    """
    if len(text1) > len(text2):
      (longtext, shorttext) = (text1, text2)
    else:
      (shorttext, longtext) = (text1, text2)
    if len(longtext) < 10 or len(shorttext) < 1:
      return None  # Pointless.

    def diff_halfMatchI(longtext, shorttext, i):
      """Does a substring of shorttext exist within longtext such that the
      substring is at least half the length of longtext?
      Closure, but does not reference any external variables.

      Args:
        longtext: Longer string.
        shorttext: Shorter string.
        i: Start index of quarter length substring within longtext.

      Returns:
        Five element Array, containing the prefix of longtext, the suffix of
        longtext, the prefix of shorttext, the suffix of shorttext and the
        common middle.  Or None if there was no match.
      """
      # NOTE(review): relies on Python 2 integer division ('/'); under
      # Python 3 these would need '//' to keep integer slice bounds.
      seed = longtext[i:i + len(longtext) / 4]
      best_common = ''
      j = shorttext.find(seed)
      # Try every occurrence of the seed in shorttext, keeping the longest
      # combined prefix+suffix extension around each hit.
      while j != -1:
        prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
        suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
        if len(best_common) < suffixLength + prefixLength:
          best_common = (shorttext[j - suffixLength:j] +
              shorttext[j:j + prefixLength])
          best_longtext_a = longtext[:i - suffixLength]
          best_longtext_b = longtext[i + prefixLength:]
          best_shorttext_a = shorttext[:j - suffixLength]
          best_shorttext_b = shorttext[j + prefixLength:]
        j = shorttext.find(seed, j + 1)

      if len(best_common) >= len(longtext) / 2:
        return (best_longtext_a, best_longtext_b,
                best_shorttext_a, best_shorttext_b, best_common)
      else:
        return None

    # First check if the second quarter is the seed for a half-match.
    hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
    # Check again based on the third quarter.
    hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
    if not hm1 and not hm2:
      return None
    elif not hm2:
      hm = hm1
    elif not hm1:
      hm = hm2
    else:
      # Both matched.  Select the longest.
      if len(hm1[4]) > len(hm2[4]):
        hm = hm1
      else:
        hm = hm2

    # A half-match was found, sort out the return data.
    if len(text1) > len(text2):
      (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
    else:
      (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
    return (text1_a, text1_b, text2_a, text2_b, mid_common)
  def diff_cleanupSemantic(self, diffs):
    """Reduce the number of edits by eliminating semantically trivial
    equalities.

    An equality shorter than both the change runs on either side of it is
    folded into a paired delete+insert, then the whole list is re-merged.

    Args:
      diffs: Array of diff tuples.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = None  # Always equal to equalities[-1][1]
    pointer = 0  # Index of current position.
    length_changes1 = 0  # Number of chars that changed prior to the equality.
    length_changes2 = 0  # Number of chars that changed after the equality.
    while pointer < len(diffs):
      if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
        equalities.append(pointer)
        length_changes1 = length_changes2
        length_changes2 = 0
        lastequality = diffs[pointer][1]
      else:  # an insertion or deletion
        length_changes2 += len(diffs[pointer][1])
        if (lastequality != None and (len(lastequality) <= length_changes1) and
            (len(lastequality) <= length_changes2)):
          # Duplicate record
          diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
          # Change second copy to insert.
          diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
              diffs[equalities[-1] + 1][1])
          # Throw away the equality we just deleted.
          equalities.pop()
          # Throw away the previous equality (it needs to be reevaluated).
          if len(equalities) != 0:
            equalities.pop()
          if len(equalities):
            pointer = equalities[-1]
          else:
            # Rewind to the start; incremented to 0 below.
            pointer = -1
          length_changes1 = 0  # Reset the counters.
          length_changes2 = 0
          lastequality = None
          changes = True
      pointer += 1

    if changes:
      self.diff_cleanupMerge(diffs)

    self.diff_cleanupSemanticLossless(diffs)
  def diff_cleanupSemanticLossless(self, diffs):
    """Look for single edits surrounded on both sides by equalities
    which can be shifted sideways to align the edit to a word boundary.
    e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

    Args:
      diffs: Array of diff tuples.
    """

    def diff_cleanupSemanticScore(one, two):
      """Given two strings, compute a score representing whether the
      internal boundary falls on logical boundaries.
      Scores range from 5 (best) to 0 (worst).
      Closure, but does not reference any external variables.

      Args:
        one: First string.
        two: Second string.

      Returns:
        The score.
      """
      if not one or not two:
        # Edges are the best.
        return 5

      # Each port of this function behaves slightly differently due to
      # subtle differences in each language's definition of things like
      # 'whitespace'.  Since this function's purpose is largely cosmetic,
      # the choice has been made to use each language's native features
      # rather than force total conformity.
      score = 0
      # One point for non-alphanumeric.
      if not one[-1].isalnum() or not two[0].isalnum():
        score += 1
        # Two points for whitespace.
        if one[-1].isspace() or two[0].isspace():
          score += 1
          # Three points for line breaks.
          if (one[-1] == "\r" or one[-1] == "\n" or
              two[0] == "\r" or two[0] == "\n"):
            score += 1
            # Four points for blank lines.
            if (re.search("\\n\\r?\\n$", one) or
                re.match("^\\r?\\n\\r?\\n", two)):
              score += 1
      return score

    pointer = 1
    # Intentionally ignore the first and last element (don't need checking).
    while pointer < len(diffs) - 1:
      if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
          diffs[pointer + 1][0] == self.DIFF_EQUAL):
        # This is a single edit surrounded by equalities.
        equality1 = diffs[pointer - 1][1]
        edit = diffs[pointer][1]
        equality2 = diffs[pointer + 1][1]

        # First, shift the edit as far left as possible.
        commonOffset = self.diff_commonSuffix(equality1, edit)
        if commonOffset:
          commonString = edit[-commonOffset:]
          equality1 = equality1[:-commonOffset]
          edit = commonString + edit[:-commonOffset]
          equality2 = commonString + equality2

        # Second, step character by character right, looking for the best fit.
        bestEquality1 = equality1
        bestEdit = edit
        bestEquality2 = equality2
        bestScore = (diff_cleanupSemanticScore(equality1, edit) +
            diff_cleanupSemanticScore(edit, equality2))
        while edit and equality2 and edit[0] == equality2[0]:
          # Slide the edit one character to the right.
          equality1 += edit[0]
          edit = edit[1:] + equality2[0]
          equality2 = equality2[1:]
          score = (diff_cleanupSemanticScore(equality1, edit) +
              diff_cleanupSemanticScore(edit, equality2))
          # The >= encourages trailing rather than leading whitespace on edits.
          if score >= bestScore:
            bestScore = score
            bestEquality1 = equality1
            bestEdit = edit
            bestEquality2 = equality2

        if diffs[pointer - 1][1] != bestEquality1:
          # We have an improvement, save it back to the diff.
          if bestEquality1:
            diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
          else:
            del diffs[pointer - 1]
            pointer -= 1
          diffs[pointer] = (diffs[pointer][0], bestEdit)
          if bestEquality2:
            diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
          else:
            del diffs[pointer + 1]
            pointer -= 1
      pointer += 1
  def diff_cleanupEfficiency(self, diffs):
    """Reduce the number of edits by eliminating operationally trivial
    equalities.

    An equality costing less than Diff_EditCost to keep is folded into the
    surrounding delete+insert operations.

    Args:
      diffs: Array of diff tuples.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = ''  # Always equal to equalities[-1][1]
    pointer = 0  # Index of current position.
    pre_ins = False  # Is there an insertion operation before the last equality.
    pre_del = False  # Is there a deletion operation before the last equality.
    post_ins = False  # Is there an insertion operation after the last equality.
    post_del = False  # Is there a deletion operation after the last equality.
    while pointer < len(diffs):
      if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
        if (len(diffs[pointer][1]) < self.Diff_EditCost and
            (post_ins or post_del)):
          # Candidate found.
          equalities.append(pointer)
          pre_ins = post_ins
          pre_del = post_del
          lastequality = diffs[pointer][1]
        else:
          # Not a candidate, and can never become one.
          equalities = []
          lastequality = ''

        post_ins = post_del = False
      else:  # an insertion or deletion
        if diffs[pointer][0] == self.DIFF_DELETE:
          post_del = True
        else:
          post_ins = True

        # Five types to be split:
        # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
        # <ins>A</ins>X<ins>C</ins><del>D</del>
        # <ins>A</ins><del>B</del>X<ins>C</ins>
        # <ins>A</ins>X<ins>C</ins><del>D</del>
        # <ins>A</ins><del>B</del>X<del>C</del>

        # Fold the equality if edits surround it on all four sides, or on
        # three sides when the equality is shorter than half the edit cost.
        # (Booleans sum as 0/1 in the three-sided test.)
        if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
                             ((len(lastequality) < self.Diff_EditCost / 2) and
                              (pre_ins + pre_del + post_ins + post_del) == 3)):
          # Duplicate record
          diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
          # Change second copy to insert.
          diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
              diffs[equalities[-1] + 1][1])
          equalities.pop()  # Throw away the equality we just deleted
          lastequality = ''
          if pre_ins and pre_del:
            # No changes made which could affect previous entry, keep going.
            post_ins = post_del = True
            equalities = []
          else:
            if len(equalities):
              equalities.pop()  # Throw away the previous equality
            if len(equalities):
              pointer = equalities[-1]
            else:
              # Rewind to the start; incremented to 0 below.
              pointer = -1
            post_ins = post_del = False
          changes = True
      pointer += 1

    if changes:
      self.diff_cleanupMerge(diffs)
  def diff_cleanupMerge(self, diffs):
    """Reorder and merge like edit sections.  Merge equalities.
    Any edit section can move as long as it doesn't cross an equality.

    Operates in place on diffs.  Runs two passes: first it coalesces runs
    of deletions/insertions (factoring out common prefixes/suffixes), then
    it slides single edits over neighbouring equalities; if the second
    pass changed anything it recurses to re-normalise.

    Args:
      diffs: Array of diff tuples.
    """
    diffs.append((self.DIFF_EQUAL, ''))  # Add a dummy entry at the end.
    pointer = 0
    count_delete = 0
    count_insert = 0
    text_delete = ''
    text_insert = ''
    while pointer < len(diffs):
      if diffs[pointer][0] == self.DIFF_INSERT:
        count_insert += 1
        text_insert += diffs[pointer][1]
        pointer += 1
      elif diffs[pointer][0] == self.DIFF_DELETE:
        count_delete += 1
        text_delete += diffs[pointer][1]
        pointer += 1
      elif diffs[pointer][0] == self.DIFF_EQUAL:
        # Upon reaching an equality, check for prior redundancies.
        if count_delete != 0 or count_insert != 0:
          if count_delete != 0 and count_insert != 0:
            # Factor out any common prefixies.
            commonlength = self.diff_commonPrefix(text_insert, text_delete)
            if commonlength != 0:
              x = pointer - count_delete - count_insert - 1
              if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
                diffs[x] = (diffs[x][0], diffs[x][1] +
                            text_insert[:commonlength])
              else:
                diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
                pointer += 1
              text_insert = text_insert[commonlength:]
              text_delete = text_delete[commonlength:]
            # Factor out any common suffixies.
            commonlength = self.diff_commonSuffix(text_insert, text_delete)
            if commonlength != 0:
              diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
                  diffs[pointer][1])
              text_insert = text_insert[:-commonlength]
              text_delete = text_delete[:-commonlength]
          # Delete the offending records and add the merged ones.
          if count_delete == 0:
            diffs[pointer - count_insert : pointer] = [
                (self.DIFF_INSERT, text_insert)]
          elif count_insert == 0:
            diffs[pointer - count_delete : pointer] = [
                (self.DIFF_DELETE, text_delete)]
          else:
            diffs[pointer - count_delete - count_insert : pointer] = [
                (self.DIFF_DELETE, text_delete),
                (self.DIFF_INSERT, text_insert)]
          pointer = pointer - count_delete - count_insert + 1
          if count_delete != 0:
            pointer += 1
          if count_insert != 0:
            pointer += 1
        elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
          # Merge this equality with the previous one.
          diffs[pointer - 1] = (diffs[pointer - 1][0],
                                diffs[pointer - 1][1] + diffs[pointer][1])
          del diffs[pointer]
        else:
          pointer += 1

        count_insert = 0
        count_delete = 0
        text_delete = ''
        text_insert = ''

    if diffs[-1][1] == '':
      diffs.pop()  # Remove the dummy entry at the end.

    # Second pass: look for single edits surrounded on both sides by equalities
    # which can be shifted sideways to eliminate an equality.
    # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
    changes = False
    pointer = 1
    # Intentionally ignore the first and last element (don't need checking).
    while pointer < len(diffs) - 1:
      if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
          diffs[pointer + 1][0] == self.DIFF_EQUAL):
        # This is a single edit surrounded by equalities.
        if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
          # Shift the edit over the previous equality.
          diffs[pointer] = (diffs[pointer][0],
              diffs[pointer - 1][1] +
              diffs[pointer][1][:-len(diffs[pointer - 1][1])])
          diffs[pointer + 1] = (diffs[pointer + 1][0],
                                diffs[pointer - 1][1] + diffs[pointer + 1][1])
          del diffs[pointer - 1]
          changes = True
        elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
          # Shift the edit over the next equality.
          diffs[pointer - 1] = (diffs[pointer - 1][0],
                                diffs[pointer - 1][1] + diffs[pointer + 1][1])
          diffs[pointer] = (diffs[pointer][0],
              diffs[pointer][1][len(diffs[pointer + 1][1]):] +
              diffs[pointer + 1][1])
          del diffs[pointer + 1]
          changes = True
      pointer += 1

    # If shifts were made, the diff needs reordering and another shift sweep.
    if changes:
      self.diff_cleanupMerge(diffs)
  def diff_xIndex(self, diffs, loc):
    """loc is a location in text1, compute and return the equivalent location
    in text2.  e.g. "The cat" vs "The big cat", 1->1, 5->8

    If loc falls inside a deletion, the location just before the deleted
    region in text2 is returned.

    Args:
      diffs: Array of diff tuples.
      loc: Location within text1.

    Returns:
      Location within text2.
    """
    chars1 = 0
    chars2 = 0
    last_chars1 = 0
    last_chars2 = 0
    # 'range' (not py2 'xrange') works identically here and is py3-portable.
    for x in range(len(diffs)):
      (op, text) = diffs[x]
      if op != self.DIFF_INSERT:  # Equality or deletion.
        chars1 += len(text)
      if op != self.DIFF_DELETE:  # Equality or insertion.
        chars2 += len(text)
      if chars1 > loc:  # Overshot the location.
        break
      last_chars1 = chars1
      last_chars2 = chars2

    if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
      # The location was deleted.
      return last_chars2
    # Add the remaining len(character).
    return last_chars2 + (loc - last_chars1)
  def diff_prettyHtml(self, diffs):
    """Convert a diff array into a pretty HTML report.

    Insertions are rendered green, deletions red; each span carries the
    text1 character offset in its TITLE attribute.

    Args:
      diffs: Array of diff tuples.

    Returns:
      HTML representation.
    """
    html = []
    i = 0  # Running character offset within text1.
    for (op, data) in diffs:
      # Escape HTML metacharacters; newlines become a pilcrow + <BR>.
      text = (data.replace("&", "&amp;").replace("<", "&lt;")
                 .replace(">", "&gt;").replace("\n", "&para;<BR>"))
      if op == self.DIFF_INSERT:
        html.append("<SPAN STYLE=\"color: #00FF00;\" TITLE=\"i=%i\">%s</SPAN>"
            % (i, text))
      elif op == self.DIFF_DELETE:
        html.append("<SPAN STYLE=\"color: #FF0000;\" TITLE=\"i=%i\">%s</SPAN>"
            % (i, text))
      elif op == self.DIFF_EQUAL:
        html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text))
      if op != self.DIFF_DELETE:
        # Insertions advance the offset; deletions do not exist in text2.
        i += len(data)
    return "".join(html)
  def diff_text1(self, diffs):
    """Compute and return the source text (all equalities and deletions).

    Args:
      diffs: Array of diff tuples.

    Returns:
      Source text.
    """
    text = []
    for (op, data) in diffs:
      # Insertions exist only in text2, so everything else is text1.
      if op != self.DIFF_INSERT:
        text.append(data)
    return "".join(text)
  def diff_text2(self, diffs):
    """Compute and return the destination text (all equalities and insertions).

    Args:
      diffs: Array of diff tuples.

    Returns:
      Destination text.
    """
    text = []
    for (op, data) in diffs:
      # Deletions exist only in text1, so everything else is text2.
      if op != self.DIFF_DELETE:
        text.append(data)
    return "".join(text)
  def diff_levenshtein(self, diffs):
    """Compute the Levenshtein distance; the number of inserted, deleted or
    substituted characters.

    Args:
      diffs: Array of diff tuples.

    Returns:
      Number of changes.
    """
    levenshtein = 0
    insertions = 0
    deletions = 0
    for (op, data) in diffs:
      if op == self.DIFF_INSERT:
        insertions += len(data)
      elif op == self.DIFF_DELETE:
        deletions += len(data)
      elif op == self.DIFF_EQUAL:
        # A deletion and an insertion is one substitution.
        levenshtein += max(insertions, deletions)
        insertions = 0
        deletions = 0
    # Flush the trailing run of edits (no final equality to trigger it).
    levenshtein += max(insertions, deletions)
    return levenshtein
  def diff_toDelta(self, diffs):
    """Crush the diff into an encoded string which describes the operations
    required to transform text1 into text2.
    E.g. =3\t-2\t+ing  -> Keep 3 chars, delete 2 chars, insert 'ing'.
    Operations are tab-separated.  Inserted text is escaped using %xx notation.

    Args:
      diffs: Array of diff tuples.

    Returns:
      Delta text.
    """
    text = []
    for (op, data) in diffs:
      if op == self.DIFF_INSERT:
        # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
        data = data.encode("utf-8")
        # Keep a URL-safe-ish charset unescaped so deltas stay readable.
        text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
      elif op == self.DIFF_DELETE:
        # Deleted text is recoverable from text1, so only its length is kept.
        text.append("-%d" % len(data))
      elif op == self.DIFF_EQUAL:
        text.append("=%d" % len(data))
    return "\t".join(text)
  def diff_fromDelta(self, text1, delta):
    """Given the original text1, and an encoded string which describes the
    operations required to transform text1 into text2, compute the full diff.

    Args:
      text1: Source string for the diff.
      delta: Delta text.

    Returns:
      Array of diff tuples.

    Raises:
      ValueError: If invalid input.
    """
    if type(delta) == unicode:
      # Deltas should be composed of a subset of ascii chars, Unicode not
      # required.  If this encode raises UnicodeEncodeError, delta is invalid.
      delta = delta.encode("ascii")
    diffs = []
    pointer = 0  # Cursor in text1
    tokens = delta.split("\t")
    for token in tokens:
      if token == "":
        # Blank tokens are ok (from a trailing \t).
        continue
      # Each token begins with a one character parameter which specifies the
      # operation of this token (delete, insert, equality).
      param = token[1:]
      if token[0] == "+":
        param = urllib.unquote(param).decode("utf-8")
        diffs.append((self.DIFF_INSERT, param))
      elif token[0] == "-" or token[0] == "=":
        try:
          n = int(param)
        except ValueError:
          raise ValueError("Invalid number in diff_fromDelta: " + param)
        if n < 0:
          raise ValueError("Negative number in diff_fromDelta: " + param)
        text = text1[pointer : pointer + n]
        pointer += n
        if token[0] == "=":
          diffs.append((self.DIFF_EQUAL, text))
        else:
          diffs.append((self.DIFF_DELETE, text))
      else:
        # Anything else is an error.
        raise ValueError("Invalid diff operation in diff_fromDelta: " +
            token[0])
    if pointer != len(text1):
      raise ValueError(
          "Delta length (%d) does not equal source text length (%d)." %
         (pointer, len(text1)))
    return diffs
  #  MATCH FUNCTIONS
  def match_main(self, text, pattern, loc):
    """Locate the best instance of 'pattern' in 'text' near 'loc'.

    Tries cheap exact checks first and only falls back to the fuzzy
    Bitap search when necessary.

    Args:
      text: The text to search.
      pattern: The pattern to search for.
      loc: The location to search around.

    Returns:
      Best match index or -1.
    """
    # Clamp loc into the valid range of text.
    loc = max(0, min(loc, len(text)))
    if text == pattern:
      # Shortcut (potentially not guaranteed by the algorithm)
      return 0
    elif not text:
      # Nothing to match.
      return -1
    elif text[loc:loc + len(pattern)] == pattern:
      # Perfect match at the perfect spot!  (Includes case of null pattern)
      return loc
    else:
      # Do a fuzzy compare.
      match = self.match_bitap(text, pattern, loc)
      return match
  def match_bitap(self, text, pattern, loc):
    """Locate the best instance of 'pattern' in 'text' near 'loc' using the
    Bitap algorithm.

    Args:
      text: The text to search.
      pattern: The pattern to search for.
      loc: The location to search around.

    Returns:
      Best match index or -1.
    """
    # Python doesn't have a maxint limit, so ignore this check.
    #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
    #  raise ValueError("Pattern too long for this application.")

    # Initialise the alphabet.
    s = self.match_alphabet(pattern)

    def match_bitapScore(e, x):
      """Compute and return the score for a match with e errors and x location.
      Accesses loc and pattern through being a closure.

      Args:
        e: Number of errors in match.
        x: Location of match.

      Returns:
        Overall score for match (0.0 = good, 1.0 = bad).
      """
      accuracy = float(e) / len(pattern)
      proximity = abs(loc - x)
      if not self.Match_Distance:
        # Dodge divide by zero error.
        return proximity and 1.0 or accuracy
      return accuracy + (proximity / float(self.Match_Distance))

    # Highest score beyond which we give up.
    score_threshold = self.Match_Threshold
    # Is there a nearby exact match? (speedup)
    best_loc = text.find(pattern, loc)
    if best_loc != -1:
      score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
      # What about in the other direction? (speedup)
      best_loc = text.rfind(pattern, loc + len(pattern))
      if best_loc != -1:
        score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

    # Initialise the bit arrays.
    matchmask = 1 << (len(pattern) - 1)
    best_loc = -1

    bin_max = len(pattern) + len(text)
    # Empty initialization added to appease pychecker.
    last_rd = None
    for d in range(len(pattern)):
      # Scan for the best match each iteration allows for one more error.
      # Run a binary search to determine how far from 'loc' we can stray at
      # this error level.
      bin_min = 0
      bin_mid = bin_max
      while bin_min < bin_mid:
        if match_bitapScore(d, loc + bin_mid) <= score_threshold:
          bin_min = bin_mid
        else:
          bin_max = bin_mid
        # Integer midpoint (floor division matches the original py2 int '/').
        bin_mid = (bin_max - bin_min) // 2 + bin_min

      # Use the result from this iteration as the maximum for the next.
      bin_max = bin_mid
      start = max(1, loc - bin_mid + 1)
      finish = min(loc + bin_mid, len(text)) + len(pattern)

      # rd needs to be a mutable list (py2 'range' returned one implicitly).
      rd = list(range(finish + 1))
      rd.append((1 << d) - 1)
      for j in range(finish, start - 1, -1):
        if len(text) <= j - 1:
          # Out of range.
          charMatch = 0
        else:
          charMatch = s.get(text[j - 1], 0)
        if d == 0:  # First pass: exact match.
          rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
        else:  # Subsequent passes: fuzzy match.
          rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
              ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
        if rd[j] & matchmask:
          score = match_bitapScore(d, j - 1)
          # This match will almost certainly be better than any existing match.
          # But check anyway.
          if score <= score_threshold:
            # Told you so.
            score_threshold = score
            best_loc = j - 1
            if best_loc > loc:
              # When passing loc, don't exceed our current distance from loc.
              start = max(1, 2 * loc - best_loc)
            else:
              # Already passed loc, downhill from here on in.
              break
      # No hope for a (better) match at greater error levels.
      if match_bitapScore(d + 1, loc) > score_threshold:
        break
      last_rd = rd
    return best_loc
  def match_alphabet(self, pattern):
    """Initialise the alphabet for the Bitap algorithm.

    Each character maps to a bitmask with a 1 at every position where the
    character occurs in pattern (bit 0 = last character).

    Args:
      pattern: The text to encode.

    Returns:
      Hash of character locations.
    """
    s = {}
    for char in pattern:
      s[char] = 0
    for i in range(len(pattern)):
      s[pattern[i]] |= 1 << (len(pattern) - i - 1)
    return s
  #  PATCH FUNCTIONS
  def patch_addContext(self, patch, text):
    """Increase the context until it is unique,
    but don't let the pattern expand beyond Match_MaxBits.

    Modifies patch in place (its diffs, start and length fields).

    Args:
      patch: The patch to grow.
      text: Source text.
    """
    if len(text) == 0:
      return
    pattern = text[patch.start2 : patch.start2 + patch.length1]
    padding = 0

    # Look for the first and last matches of pattern in text.  If two different
    # matches are found, increase the pattern length.
    while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
        0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
        self.Patch_Margin)):
      padding += self.Patch_Margin
      pattern = text[max(0, patch.start2 - padding) :
                     patch.start2 + patch.length1 + padding]
    # Add one chunk for good luck.
    padding += self.Patch_Margin

    # Add the prefix.
    prefix = text[max(0, patch.start2 - padding) : patch.start2]
    if prefix:
      patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
    # Add the suffix.
    suffix = text[patch.start2 + patch.length1 :
                  patch.start2 + patch.length1 + padding]
    if suffix:
      patch.diffs.append((self.DIFF_EQUAL, suffix))

    # Roll back the start points.
    patch.start1 -= len(prefix)
    patch.start2 -= len(prefix)
    # Extend lengths.
    patch.length1 += len(prefix) + len(suffix)
    patch.length2 += len(prefix) + len(suffix)
  def patch_make(self, a, b=None, c=None):
    """Compute a list of patches to turn text1 into text2.
    Use diffs if provided, otherwise compute it ourselves.
    There are four ways to call this function, depending on what data is
    available to the caller:
    Method 1:
    a = text1, b = text2
    Method 2:
    a = diffs
    Method 3 (optimal):
    a = text1, b = diffs
    Method 4 (deprecated, use method 3):
    a = text1, b = text2, c = diffs

    Args:
      a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
          text2 (method 2).
      b: text2 (methods 1,4) or Array of diff tuples for text1 to
          text2 (method 3) or undefined (method 2).
      c: Array of diff tuples for text1 to text2 (method 4) or
          undefined (methods 1,2,3).

    Returns:
      Array of patch objects.

    Raises:
      ValueError: If the argument combination matches none of the methods.
    """
    text1 = None
    diffs = None
    # Note that texts may arrive as 'str' or 'unicode'.
    if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
      # Method 1: text1, text2
      # Compute diffs from text1 and text2.
      text1 = a
      diffs = self.diff_main(text1, b, True)
      if len(diffs) > 2:
        self.diff_cleanupSemantic(diffs)
        self.diff_cleanupEfficiency(diffs)
    elif isinstance(a, list) and b is None and c is None:
      # Method 2: diffs
      # Compute text1 from diffs.
      diffs = a
      text1 = self.diff_text1(diffs)
    elif isinstance(a, basestring) and isinstance(b, list) and c is None:
      # Method 3: text1, diffs
      text1 = a
      diffs = b
    elif (isinstance(a, basestring) and isinstance(b, basestring) and
          isinstance(c, list)):
      # Method 4: text1, text2, diffs
      # text2 is not used.
      text1 = a
      diffs = c
    else:
      raise ValueError("Unknown call format to patch_make.")

    if not diffs:
      return []  # Get rid of the None case.
    patches = []
    patch = patch_obj()
    char_count1 = 0  # Number of characters into the text1 string.
    char_count2 = 0  # Number of characters into the text2 string.
    prepatch_text = text1  # Recreate the patches to determine context info.
    postpatch_text = text1
    for x in range(len(diffs)):
      (diff_type, diff_text) = diffs[x]
      if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
        # A new patch starts here.
        patch.start1 = char_count1
        patch.start2 = char_count2
      if diff_type == self.DIFF_INSERT:
        # Insertion
        patch.diffs.append(diffs[x])
        patch.length2 += len(diff_text)
        postpatch_text = (postpatch_text[:char_count2] + diff_text +
                          postpatch_text[char_count2:])
      elif diff_type == self.DIFF_DELETE:
        # Deletion.
        patch.length1 += len(diff_text)
        patch.diffs.append(diffs[x])
        postpatch_text = (postpatch_text[:char_count2] +
                          postpatch_text[char_count2 + len(diff_text):])
      elif (diff_type == self.DIFF_EQUAL and
            len(diff_text) <= 2 * self.Patch_Margin and
            len(patch.diffs) != 0 and len(diffs) != x + 1):
        # Small equality inside a patch.
        patch.diffs.append(diffs[x])
        patch.length1 += len(diff_text)
        patch.length2 += len(diff_text)

      if (diff_type == self.DIFF_EQUAL and
          len(diff_text) >= 2 * self.Patch_Margin):
        # Time for a new patch.
        if len(patch.diffs) != 0:
          self.patch_addContext(patch, prepatch_text)
          patches.append(patch)
          patch = patch_obj()
          # Unlike Unidiff, our patch lists have a rolling context.
          # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
          # Update prepatch text & pos to reflect the application of the
          # just completed patch.
          prepatch_text = postpatch_text
          char_count1 = char_count2

      # Update the current character count.
      if diff_type != self.DIFF_INSERT:
        char_count1 += len(diff_text)
      if diff_type != self.DIFF_DELETE:
        char_count2 += len(diff_text)

    # Pick up the leftover patch if not empty.
    if len(patch.diffs) != 0:
      self.patch_addContext(patch, prepatch_text)
      patches.append(patch)
    return patches
  def patch_deepCopy(self, patches):
    """Given an array of patches, return another array that is identical.

    Args:
      patches: Array of patch objects.

    Returns:
      Array of patch objects.
    """
    patchesCopy = []
    for patch in patches:
      patchCopy = patch_obj()
      # No need to deep copy the tuples since they are immutable.
      patchCopy.diffs = patch.diffs[:]
      patchCopy.start1 = patch.start1
      patchCopy.start2 = patch.start2
      patchCopy.length1 = patch.length1
      patchCopy.length2 = patch.length2
      patchesCopy.append(patchCopy)
    return patchesCopy
  def patch_apply(self, patches, text):
    """Merge a set of patches onto the text.  Return a patched text, as well
    as a list of true/false values indicating which patches were applied.

    Args:
      patches: Array of patch objects.
      text: Old text.

    Returns:
      Two element Array, containing the new text and an array of boolean values.
    """
    if not patches:
      return (text, [])

    # Deep copy the patches so that no changes are made to originals.
    patches = self.patch_deepCopy(patches)

    # Pad both ends so that patches touching the edges still have context.
    nullPadding = self.patch_addPadding(patches)
    text = nullPadding + text + nullPadding
    self.patch_splitMax(patches)

    # delta keeps track of the offset between the expected and actual location
    # of the previous patch.  If there are patches expected at positions 10 and
    # 20, but the first patch was found at 12, delta is 2 and the second patch
    # has an effective expected position of 22.
    delta = 0
    results = []
    for patch in patches:
      expected_loc = patch.start2 + delta
      text1 = self.diff_text1(patch.diffs)
      end_loc = -1
      if len(text1) > self.Match_MaxBits:
        # patch_splitMax will only provide an oversized pattern in the case of
        # a monster delete.
        start_loc = self.match_main(text, text1[:self.Match_MaxBits],
                                    expected_loc)
        if start_loc != -1:
          end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
              expected_loc + len(text1) - self.Match_MaxBits)
          if end_loc == -1 or start_loc >= end_loc:
            # Can't find valid trailing context.  Drop this patch.
            start_loc = -1
      else:
        start_loc = self.match_main(text, text1, expected_loc)
      if start_loc == -1:
        # No match found.  :(
        results.append(False)
        # Subtract the delta for this failed patch from subsequent patches.
        delta -= patch.length2 - patch.length1
      else:
        # Found a match.  :)
        results.append(True)
        delta = start_loc - expected_loc
        if end_loc == -1:
          text2 = text[start_loc : start_loc + len(text1)]
        else:
          text2 = text[start_loc : end_loc + self.Match_MaxBits]
        if text1 == text2:
          # Perfect match, just shove the replacement text in.
          text = (text[:start_loc] + self.diff_text2(patch.diffs) +
                      text[start_loc + len(text1):])
        else:
          # Imperfect match.
          # Run a diff to get a framework of equivalent indices.
          diffs = self.diff_main(text1, text2, False)
          if (len(text1) > self.Match_MaxBits and
              self.diff_levenshtein(diffs) / float(len(text1)) >
              self.Patch_DeleteThreshold):
            # The end points match, but the content is unacceptably bad.
            results[-1] = False
          else:
            self.diff_cleanupSemanticLossless(diffs)
            index1 = 0
            for (op, data) in patch.diffs:
              if op != self.DIFF_EQUAL:
                index2 = self.diff_xIndex(diffs, index1)
              if op == self.DIFF_INSERT:  # Insertion
                text = text[:start_loc + index2] + data + text[start_loc +
                                                               index2:]
              elif op == self.DIFF_DELETE:  # Deletion
                text = text[:start_loc + index2] + text[start_loc +
                    self.diff_xIndex(diffs, index1 + len(data)):]
              if op != self.DIFF_DELETE:
                index1 += len(data)
    # Strip the padding off.
    text = text[len(nullPadding):-len(nullPadding)]
    return (text, results)
  def patch_addPadding(self, patches):
    """Add some padding on text start and end so that edges can match
    something.  Intended to be called only from within patch_apply.

    Args:
      patches: Array of patch objects.

    Returns:
      The padding string added to each side.
    """
    paddingLength = self.Patch_Margin
    # Padding is made of control characters \x01..\x0N, which are unlikely
    # to occur in real text.
    nullPadding = ""
    for x in range(1, paddingLength + 1):
      nullPadding += chr(x)

    # Bump all the patches forward.
    for patch in patches:
      patch.start1 += paddingLength
      patch.start2 += paddingLength

    # Add some padding on start of first diff.
    patch = patches[0]
    diffs = patch.diffs
    if not diffs or diffs[0][0] != self.DIFF_EQUAL:
      # Add nullPadding equality.
      diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
      patch.start1 -= paddingLength  # Should be 0.
      patch.start2 -= paddingLength  # Should be 0.
      patch.length1 += paddingLength
      patch.length2 += paddingLength
    elif paddingLength > len(diffs[0][1]):
      # Grow first equality.
      extraLength = paddingLength - len(diffs[0][1])
      newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
      diffs[0] = (diffs[0][0], newText)
      patch.start1 -= extraLength
      patch.start2 -= extraLength
      patch.length1 += extraLength
      patch.length2 += extraLength

    # Add some padding on end of last diff.
    patch = patches[-1]
    diffs = patch.diffs
    if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
      # Add nullPadding equality.
      diffs.append((self.DIFF_EQUAL, nullPadding))
      patch.length1 += paddingLength
      patch.length2 += paddingLength
    elif paddingLength > len(diffs[-1][1]):
      # Grow last equality.
      extraLength = paddingLength - len(diffs[-1][1])
      newText = diffs[-1][1] + nullPadding[:extraLength]
      diffs[-1] = (diffs[-1][0], newText)
      patch.length1 += extraLength
      patch.length2 += extraLength

    return nullPadding
  def patch_splitMax(self, patches):
    """Look through the patches and break up any which are longer than the
    maximum limit of the match algorithm.

    Args:
      patches: Array of patch objects (modified in place).
    """
    # Match_MaxBits == 0 means the match algorithm has no length limit.
    if self.Match_MaxBits == 0:
      return
    for x in xrange(len(patches)):
      if patches[x].length1 > self.Match_MaxBits:
        bigpatch = patches[x]
        # Remove the big old patch; the small replacements are inserted below.
        del patches[x]
        # Rebinding x positions the inserts within this iteration only.
        # NOTE(review): xrange was sized before the list was mutated, so if a
        # split changes the patch count the remaining indices shift -- confirm
        # callers never hit the shrinking case (empty patch dropped entirely).
        x -= 1
        patch_size = self.Match_MaxBits
        # start1/start2 track where the next small patch begins in each text.
        start1 = bigpatch.start1
        start2 = bigpatch.start2
        # Equality context carried over from the previous small patch.
        precontext = ''
        while len(bigpatch.diffs) != 0:
          # Create one of several smaller patches.
          patch = patch_obj()
          empty = True
          patch.start1 = start1 - len(precontext)
          patch.start2 = start2 - len(precontext)
          if precontext:
            patch.length1 = patch.length2 = len(precontext)
            patch.diffs.append((self.DIFF_EQUAL, precontext))

          # Consume bigpatch's diffs until this small patch is nearly full
          # (leaving Patch_Margin room for the trailing context).
          while (len(bigpatch.diffs) != 0 and
                 patch.length1 < patch_size - self.Patch_Margin):
            (diff_type, diff_text) = bigpatch.diffs[0]
            if diff_type == self.DIFF_INSERT:
              # Insertions are harmless.
              patch.length2 += len(diff_text)
              start2 += len(diff_text)
              patch.diffs.append(bigpatch.diffs.pop(0))
              empty = False
            elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
                patch.diffs[0][0] == self.DIFF_EQUAL and
                len(diff_text) > 2 * patch_size):
              # This is a large deletion.  Let it pass in one chunk.
              patch.length1 += len(diff_text)
              start1 += len(diff_text)
              empty = False
              patch.diffs.append((diff_type, diff_text))
              del bigpatch.diffs[0]
            else:
              # Deletion or equality.  Only take as much as we can stomach.
              diff_text = diff_text[:patch_size - patch.length1 -
                                    self.Patch_Margin]
              patch.length1 += len(diff_text)
              start1 += len(diff_text)
              if diff_type == self.DIFF_EQUAL:
                patch.length2 += len(diff_text)
                start2 += len(diff_text)
              else:
                empty = False

              patch.diffs.append((diff_type, diff_text))
              if diff_text == bigpatch.diffs[0][1]:
                # The whole diff was consumed.
                del bigpatch.diffs[0]
              else:
                # Leave the uneaten remainder at the head of bigpatch.
                bigpatch.diffs[0] = (bigpatch.diffs[0][0],
                                     bigpatch.diffs[0][1][len(diff_text):])

          # Compute the head context for the next patch.
          precontext = self.diff_text2(patch.diffs)
          precontext = precontext[-self.Patch_Margin:]
          # Append the end context for this patch.
          postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
          if postcontext:
            patch.length1 += len(postcontext)
            patch.length2 += len(postcontext)
            # Merge into a trailing equality if one exists.
            if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
              patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
                                 postcontext)
            else:
              patch.diffs.append((self.DIFF_EQUAL, postcontext))

          # Only keep patches that contain an actual insert or delete.
          if not empty:
            x += 1
            patches.insert(x, patch)
  def patch_toText(self, patches):
    """Serialise a list of patches into a single textual representation.

    Args:
      patches: Array of patch objects.

    Returns:
      Text representation of patches.
    """
    # Each patch renders itself via __str__; concatenate them all.
    return "".join([str(patch) for patch in patches])
  def patch_fromText(self, textline):
    """Parse a textual representation of patches and return a list of patch
    objects.

    Args:
      textline: Text representation of patches.

    Returns:
      Array of patch objects.

    Raises:
      ValueError: If invalid input.
    """
    if type(textline) == unicode:
      # Patches should be composed of a subset of ascii chars, Unicode not
      # required.  If this encode raises UnicodeEncodeError, patch is invalid.
      textline = textline.encode("ascii")
    patches = []
    if not textline:
      return patches
    # Work through the input line by line, consuming lines as they parse.
    text = textline.split('\n')
    while len(text) != 0:
      # Every patch opens with a GNU-diff style header, e.g. "@@ -1,4 +1,4 @@".
      m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
      if not m:
        raise ValueError, "Invalid patch string: " + text[0]
      patch = patch_obj()
      patches.append(patch)
      # Decode the text1 "start,length" pair.  Headers use 1-based starts for
      # non-empty ranges (see patch_obj.__str__), hence the -1 adjustments.
      patch.start1 = int(m.group(1))
      if m.group(2) == '':
        # Omitted length means exactly one character.
        patch.start1 -= 1
        patch.length1 = 1
      elif m.group(2) == '0':
        # Zero-length range: the start is already 0-based.
        patch.length1 = 0
      else:
        patch.start1 -= 1
        patch.length1 = int(m.group(2))

      # Decode the text2 "start,length" pair the same way.
      patch.start2 = int(m.group(3))
      if m.group(4) == '':
        patch.start2 -= 1
        patch.length2 = 1
      elif m.group(4) == '0':
        patch.length2 = 0
      else:
        patch.start2 -= 1
        patch.length2 = int(m.group(4))

      del text[0]

      # Body lines: first char is the operation sign, the rest is the
      # %xx-escaped diff text.
      while len(text) != 0:
        if text[0]:
          sign = text[0][0]
        else:
          sign = ''
        line = urllib.unquote(text[0][1:])
        line = line.decode("utf-8")
        if sign == '+':
          # Insertion.
          patch.diffs.append((self.DIFF_INSERT, line))
        elif sign == '-':
          # Deletion.
          patch.diffs.append((self.DIFF_DELETE, line))
        elif sign == ' ':
          # Minor equality.
          patch.diffs.append((self.DIFF_EQUAL, line))
        elif sign == '@':
          # Start of next patch.
          break
        elif sign == '':
          # Blank line?  Whatever.
          pass
        else:
          # WTF?
          raise ValueError, "Invalid patch mode: '%s'\n%s" % (sign, line)
        del text[0]
    return patches
class patch_obj:
  """Class representing one patch operation.

  A patch is a run of diff tuples plus the coordinates locating it in the
  source (text1) and destination (text2) texts.
  """

  def __init__(self):
    """Initializes with an empty list of diffs.
    """
    # List of (op, text) diff tuples making up this patch.
    self.diffs = []
    # 0-based start offset of the patch in text1 / text2.
    self.start1 = None
    self.start2 = None
    # Number of characters the patch spans in text1 / text2.
    self.length1 = 0
    self.length2 = 0

  def __str__(self):
    """Emulate GNU diff's format.
    Header: @@ -382,8 +481,9 @@
    Indices are printed as 1-based, not 0-based.

    Returns:
      The GNU diff string.
    """
    # Length 0 keeps the 0-based start with an explicit ",0"; length 1 omits
    # the length field entirely (GNU diff convention, mirrored by
    # patch_fromText when parsing).
    if self.length1 == 0:
      coords1 = str(self.start1) + ",0"
    elif self.length1 == 1:
      coords1 = str(self.start1 + 1)
    else:
      coords1 = str(self.start1 + 1) + "," + str(self.length1)
    if self.length2 == 0:
      coords2 = str(self.start2) + ",0"
    elif self.length2 == 1:
      coords2 = str(self.start2 + 1)
    else:
      coords2 = str(self.start2 + 1) + "," + str(self.length2)
    text = ["@@ -", coords1, " +", coords2, " @@\n"]
    # Escape the body of the patch with %xx notation.
    for (op, data) in self.diffs:
      if op == diff_match_patch.DIFF_INSERT:
        text.append("+")
      elif op == diff_match_patch.DIFF_DELETE:
        text.append("-")
      elif op == diff_match_patch.DIFF_EQUAL:
        text.append(" ")
      # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
      data = data.encode("utf-8")
      text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
    return "".join(text)