36
38
class diff_match_patch:
37
"""Class containing the diff, match and patch methods.
39
Also contains the behaviour settings.
43
"""Inits a diff_match_patch object with default settings.
44
Redefine these in your program to override the defaults.
47
# Number of seconds to map a diff before giving up (0 for infinity).
48
self.Diff_Timeout = 1.0
49
# Cost of an empty edit operation in terms of edit characters.
50
self.Diff_EditCost = 4
51
# The size beyond which the double-ended diff activates.
52
# Double-ending is twice as fast, but less accurate.
53
self.Diff_DualThreshold = 32
54
# At what point is no match declared (0.0 = perfection, 1.0 = very loose).
55
self.Match_Threshold = 0.5
56
# How far to search for a match (0 = exact location, 1000+ = broad match).
57
# A match this many characters away from the expected location will add
58
# 1.0 to the score (0.0 is a perfect match).
59
self.Match_Distance = 1000
60
# When deleting a large block of text (over ~64 characters), how close does
61
# the contents have to match the expected contents. (0.0 = perfection,
62
# 1.0 = very loose). Note that Match_Threshold controls how closely the
63
# end points of a delete need to match.
64
self.Patch_DeleteThreshold = 0.5
65
# Chunk size for context length.
68
# How many bits in a number?
69
# Python has no maximum, thus to disable patch splitting set to 0.
70
# However to avoid long patches in certain pathological cases, use 32.
71
# Multiple short patches (using native ints) are much faster than long ones.
72
self.Match_MaxBits = 32
76
# The data structure representing a diff is an array of tuples:
77
# [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
78
# which means: delete "Hello", add "Goodbye" and keep " world."
83
def diff_main(self, text1, text2, checklines=True):
84
"""Find the differences between two texts. Simplifies the problem by
85
stripping any common prefix or suffix off the texts before diffing.
88
text1: Old string to be diffed.
89
text2: New string to be diffed.
90
checklines: Optional speedup flag. If present and false, then don't run
91
a line-level diff first to identify the changed areas.
92
Defaults to true, which does a faster, slightly less optimal diff.
98
# Check for equality (speedup)
100
return [(self.DIFF_EQUAL, text1)]
102
# Trim off common prefix (speedup)
103
commonlength = self.diff_commonPrefix(text1, text2)
104
commonprefix = text1[:commonlength]
105
text1 = text1[commonlength:]
106
text2 = text2[commonlength:]
108
# Trim off common suffix (speedup)
109
commonlength = self.diff_commonSuffix(text1, text2)
110
if commonlength == 0:
113
commonsuffix = text1[-commonlength:]
114
text1 = text1[:-commonlength]
115
text2 = text2[:-commonlength]
117
# Compute the diff on the middle block
118
diffs = self.diff_compute(text1, text2, checklines)
120
# Restore the prefix and suffix
122
diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
124
diffs.append((self.DIFF_EQUAL, commonsuffix))
125
self.diff_cleanupMerge(diffs)
128
def diff_compute(self, text1, text2, checklines):
129
"""Find the differences between two texts. Assumes that the texts do not
130
have any common prefix or suffix.
133
text1: Old string to be diffed.
134
text2: New string to be diffed.
135
checklines: Speedup flag. If false, then don't run a line-level diff
136
first to identify the changed areas.
137
If true, then run a faster, slightly less optimal diff.
143
# Just add some text (speedup)
144
return [(self.DIFF_INSERT, text2)]
147
# Just delete some text (speedup)
148
return [(self.DIFF_DELETE, text1)]
150
if len(text1) > len(text2):
151
(longtext, shorttext) = (text1, text2)
153
(shorttext, longtext) = (text1, text2)
154
i = longtext.find(shorttext)
156
# Shorter text is inside the longer text (speedup)
157
diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
158
(self.DIFF_INSERT, longtext[i + len(shorttext):])]
159
# Swap insertions for deletions if diff is reversed.
160
if len(text1) > len(text2):
161
diffs[0] = (self.DIFF_DELETE, diffs[0][1])
162
diffs[2] = (self.DIFF_DELETE, diffs[2][1])
164
longtext = shorttext = None # Garbage collect.
166
# Check to see if the problem can be split in two.
167
hm = self.diff_halfMatch(text1, text2)
169
# A half-match was found, sort out the return data.
170
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
171
# Send both pairs off for separate processing.
172
diffs_a = self.diff_main(text1_a, text2_a, checklines)
173
diffs_b = self.diff_main(text1_b, text2_b, checklines)
175
return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
177
# Perform a real diff.
178
if checklines and (len(text1) < 100 or len(text2) < 100):
179
checklines = False # Too trivial for the overhead.
181
# Scan the text on a line-by-line basis first.
182
(text1, text2, linearray) = self.diff_linesToChars(text1, text2)
184
diffs = self.diff_map(text1, text2)
185
if not diffs: # No acceptable result.
186
diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
188
# Convert the diff back to original text.
189
self.diff_charsToLines(diffs, linearray)
190
# Eliminate freak matches (e.g. blank lines)
191
self.diff_cleanupSemantic(diffs)
193
# Rediff any replacement blocks, this time character-by-character.
194
# Add a dummy entry at the end.
195
diffs.append((self.DIFF_EQUAL, ''))
201
while pointer < len(diffs):
202
if diffs[pointer][0] == self.DIFF_INSERT:
204
text_insert += diffs[pointer][1]
205
elif diffs[pointer][0] == self.DIFF_DELETE:
207
text_delete += diffs[pointer][1]
208
elif diffs[pointer][0] == self.DIFF_EQUAL:
209
# Upon reaching an equality, check for prior redundancies.
210
if count_delete >= 1 and count_insert >= 1:
211
# Delete the offending records and add the merged ones.
212
a = self.diff_main(text_delete, text_insert, False)
213
diffs[pointer - count_delete - count_insert : pointer] = a
214
pointer = pointer - count_delete - count_insert + len(a)
222
diffs.pop() # Remove the dummy entry at the end.
225
def diff_linesToChars(self, text1, text2):
226
"""Split two texts into an array of strings. Reduce the texts to a string
227
of hashes where each Unicode character represents one line.
231
text2: Second string.
234
Three element tuple, containing the encoded text1, the encoded text2 and
235
the array of unique strings. The zeroth element of the array of unique
236
strings is intentionally blank.
238
lineArray = [] # e.g. lineArray[4] == "Hello\n"
239
lineHash = {} # e.g. lineHash["Hello\n"] == 4
241
# "\x00" is a valid character, but various debuggers don't like it.
242
# So we'll insert a junk entry to avoid generating a null character.
245
def diff_linesToCharsMunge(text):
246
"""Split a text into an array of strings. Reduce the texts to a string
247
of hashes where each Unicode character represents one line.
248
Modifies linearray and linehash through being a closure.
251
text: String to encode.
257
# Walk the text, pulling out a substring for each line.
258
# text.split('\n') would would temporarily double our memory footprint.
259
# Modifying text would create many large strings to garbage collect.
262
while lineEnd < len(text) - 1:
263
lineEnd = text.find('\n', lineStart)
265
lineEnd = len(text) - 1
266
line = text[lineStart:lineEnd + 1]
267
lineStart = lineEnd + 1
270
chars.append(unichr(lineHash[line]))
272
lineArray.append(line)
273
lineHash[line] = len(lineArray) - 1
274
chars.append(unichr(len(lineArray) - 1))
275
return "".join(chars)
277
chars1 = diff_linesToCharsMunge(text1)
278
chars2 = diff_linesToCharsMunge(text2)
279
return (chars1, chars2, lineArray)
281
def diff_charsToLines(self, diffs, lineArray):
282
"""Rehydrate the text in a diff from a string of line hashes to real lines
286
diffs: Array of diff tuples.
287
lineArray: Array of unique strings.
289
for x in xrange(len(diffs)):
291
for char in diffs[x][1]:
292
text.append(lineArray[ord(char)])
293
diffs[x] = (diffs[x][0], "".join(text))
295
def diff_map(self, text1, text2):
296
"""Explore the intersection points between the two texts.
299
text1: Old string to be diffed.
300
text2: New string to be diffed.
303
Array of diff tuples or None if no diff available.
306
# Unlike in most languages, Python counts time in seconds.
307
s_end = time.time() + self.Diff_Timeout # Don't run for too long.
308
# Cache the text lengths to prevent multiple calls.
309
text1_length = len(text1)
310
text2_length = len(text2)
311
max_d = text1_length + text2_length - 1
312
doubleEnd = self.Diff_DualThreshold * 2 < max_d
321
# If the total number of characters is odd, then the front path will
322
# collide with the reverse path.
323
front = (text1_length + text2_length) % 2
324
for d in xrange(max_d):
325
# Bail out if timeout reached.
326
if self.Diff_Timeout > 0 and time.time() > s_end:
329
# Walk the front path one step.
331
for k in xrange(-d, d + 1, 2):
332
if k == -d or k != d and v1[k - 1] < v1[k + 1]:
339
if front and footstep in footsteps:
342
footsteps[footstep] = d
344
while (not done and x < text1_length and y < text2_length and
345
text1[x] == text2[y]):
350
if front and footstep in footsteps:
353
footsteps[footstep] = d
356
v_map1[d][(x, y)] = True
357
if x == text1_length and y == text2_length:
358
# Reached the end in single-path mode.
359
return self.diff_path1(v_map1, text1, text2)
361
# Front path ran over reverse path.
362
v_map2 = v_map2[:footsteps[footstep] + 1]
363
a = self.diff_path1(v_map1, text1[:x], text2[:y])
364
b = self.diff_path2(v_map2, text1[x:], text2[y:])
368
# Walk the reverse path one step.
370
for k in xrange(-d, d + 1, 2):
371
if k == -d or k != d and v2[k - 1] < v2[k + 1]:
376
footstep = (text1_length - x, text2_length - y)
377
if not front and footstep in footsteps:
380
footsteps[footstep] = d
381
while (not done and x < text1_length and y < text2_length and
382
text1[-x - 1] == text2[-y - 1]):
385
footstep = (text1_length - x, text2_length - y)
386
if not front and footstep in footsteps:
389
footsteps[footstep] = d
392
v_map2[d][(x, y)] = True
394
# Reverse path ran over front path.
395
v_map1 = v_map1[:footsteps[footstep] + 1]
396
a = self.diff_path1(v_map1, text1[:text1_length - x],
397
text2[:text2_length - y])
398
b = self.diff_path2(v_map2, text1[text1_length - x:],
399
text2[text2_length - y:])
402
# Number of diffs equals number of characters, no commonality at all.
405
def diff_path1(self, v_map, text1, text2):
406
"""Work from the middle back to the start to determine the path.
409
v_map: Array of paths.
410
text1: Old string fragment to be diffed.
411
text2: New string fragment to be diffed.
414
Array of diff tuples.
420
for d in xrange(len(v_map) - 2, -1, -1):
422
if (x - 1, y) in v_map[d]:
424
if last_op == self.DIFF_DELETE:
425
path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
427
path[:0] = [(self.DIFF_DELETE, text1[x])]
428
last_op = self.DIFF_DELETE
430
elif (x, y - 1) in v_map[d]:
432
if last_op == self.DIFF_INSERT:
433
path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
435
path[:0] = [(self.DIFF_INSERT, text2[y])]
436
last_op = self.DIFF_INSERT
441
assert text1[x] == text2[y], ("No diagonal. " +
442
"Can't happen. (diff_path1)")
443
if last_op == self.DIFF_EQUAL:
444
path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
446
path[:0] = [(self.DIFF_EQUAL, text1[x])]
447
last_op = self.DIFF_EQUAL
450
def diff_path2(self, v_map, text1, text2):
451
"""Work from the middle back to the end to determine the path.
454
v_map: Array of paths.
455
text1: Old string fragment to be diffed.
456
text2: New string fragment to be diffed.
459
Array of diff tuples.
465
for d in xrange(len(v_map) - 2, -1, -1):
467
if (x - 1, y) in v_map[d]:
469
if last_op == self.DIFF_DELETE:
470
path[-1] = (self.DIFF_DELETE, path[-1][1] + text1[-x - 1])
472
path.append((self.DIFF_DELETE, text1[-x - 1]))
473
last_op = self.DIFF_DELETE
475
elif (x, y - 1) in v_map[d]:
477
if last_op == self.DIFF_INSERT:
478
path[-1] = (self.DIFF_INSERT, path[-1][1] + text2[-y - 1])
480
path.append((self.DIFF_INSERT, text2[-y - 1]))
481
last_op = self.DIFF_INSERT
486
assert text1[-x - 1] == text2[-y - 1], ("No diagonal. " +
487
"Can't happen. (diff_path2)")
488
if last_op == self.DIFF_EQUAL:
489
path[-1] = (self.DIFF_EQUAL, path[-1][1] + text1[-x - 1])
491
path.append((self.DIFF_EQUAL, text1[-x - 1]))
492
last_op = self.DIFF_EQUAL
495
def diff_commonPrefix(self, text1, text2):
496
"""Determine the common prefix of two strings.
500
text2: Second string.
503
The number of characters common to the start of each string.
505
# Quick check for common null cases.
506
if not text1 or not text2 or text1[0] != text2[0]:
509
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
511
pointermax = min(len(text1), len(text2))
512
pointermid = pointermax
514
while pointermin < pointermid:
515
if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
516
pointermin = pointermid
517
pointerstart = pointermin
519
pointermax = pointermid
520
pointermid = int((pointermax - pointermin) / 2 + pointermin)
523
def diff_commonSuffix(self, text1, text2):
524
"""Determine the common suffix of two strings.
528
text2: Second string.
531
The number of characters common to the end of each string.
533
# Quick check for common null cases.
534
if not text1 or not text2 or text1[-1] != text2[-1]:
537
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
539
pointermax = min(len(text1), len(text2))
540
pointermid = pointermax
542
while pointermin < pointermid:
543
if (text1[-pointermid:len(text1) - pointerend] ==
544
text2[-pointermid:len(text2) - pointerend]):
545
pointermin = pointermid
546
pointerend = pointermin
548
pointermax = pointermid
549
pointermid = int((pointermax - pointermin) / 2 + pointermin)
552
def diff_halfMatch(self, text1, text2):
553
"""Do the two texts share a substring which is at least half the length of
558
text2: Second string.
561
Five element Array, containing the prefix of text1, the suffix of text1,
562
the prefix of text2, the suffix of text2 and the common middle. Or None
563
if there was no match.
565
if len(text1) > len(text2):
566
(longtext, shorttext) = (text1, text2)
568
(shorttext, longtext) = (text1, text2)
569
if len(longtext) < 10 or len(shorttext) < 1:
570
return None # Pointless.
572
def diff_halfMatchI(longtext, shorttext, i):
573
"""Does a substring of shorttext exist within longtext such that the
574
substring is at least half the length of longtext?
575
Closure, but does not reference any external variables.
578
longtext: Longer string.
579
shorttext: Shorter string.
580
i: Start index of quarter length substring within longtext.
583
Five element Array, containing the prefix of longtext, the suffix of
584
longtext, the prefix of shorttext, the suffix of shorttext and the
585
common middle. Or None if there was no match.
587
seed = longtext[i:i + len(longtext) / 4]
589
j = shorttext.find(seed)
591
prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
592
suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
593
if len(best_common) < suffixLength + prefixLength:
594
best_common = (shorttext[j - suffixLength:j] +
595
shorttext[j:j + prefixLength])
596
best_longtext_a = longtext[:i - suffixLength]
597
best_longtext_b = longtext[i + prefixLength:]
598
best_shorttext_a = shorttext[:j - suffixLength]
599
best_shorttext_b = shorttext[j + prefixLength:]
600
j = shorttext.find(seed, j + 1)
602
if len(best_common) >= len(longtext) / 2:
603
return (best_longtext_a, best_longtext_b,
604
best_shorttext_a, best_shorttext_b, best_common)
608
# First check if the second quarter is the seed for a half-match.
609
hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
610
# Check again based on the third quarter.
611
hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
612
if not hm1 and not hm2:
619
# Both matched. Select the longest.
620
if len(hm1[4]) > len(hm2[4]):
625
# A half-match was found, sort out the return data.
626
if len(text1) > len(text2):
627
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
629
(text2_a, text2_b, text1_a, text1_b, mid_common) = hm
630
return (text1_a, text1_b, text2_a, text2_b, mid_common)
632
def diff_cleanupSemantic(self, diffs):
633
"""Reduce the number of edits by eliminating semantically trivial
637
diffs: Array of diff tuples.
640
equalities = [] # Stack of indices where equalities are found.
641
lastequality = None # Always equal to equalities[-1][1]
642
pointer = 0 # Index of current position.
643
length_changes1 = 0 # Number of chars that changed prior to the equality.
644
length_changes2 = 0 # Number of chars that changed after the equality.
645
while pointer < len(diffs):
646
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
647
equalities.append(pointer)
648
length_changes1 = length_changes2
650
lastequality = diffs[pointer][1]
651
else: # an insertion or deletion
652
length_changes2 += len(diffs[pointer][1])
653
if (lastequality != None and (len(lastequality) <= length_changes1) and
654
(len(lastequality) <= length_changes2)):
656
diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
657
# Change second copy to insert.
658
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
659
diffs[equalities[-1] + 1][1])
660
# Throw away the equality we just deleted.
662
# Throw away the previous equality (it needs to be reevaluated).
663
if len(equalities) != 0:
666
pointer = equalities[-1]
669
length_changes1 = 0 # Reset the counters.
676
self.diff_cleanupMerge(diffs)
678
self.diff_cleanupSemanticLossless(diffs)
680
def diff_cleanupSemanticLossless(self, diffs):
681
"""Look for single edits surrounded on both sides by equalities
682
which can be shifted sideways to align the edit to a word boundary.
683
e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
686
diffs: Array of diff tuples.
689
def diff_cleanupSemanticScore(one, two):
690
"""Given two strings, compute a score representing whether the
691
internal boundary falls on logical boundaries.
692
Scores range from 5 (best) to 0 (worst).
693
Closure, but does not reference any external variables.
702
if not one or not two:
703
# Edges are the best.
706
# Each port of this function behaves slightly differently due to
707
# subtle differences in each language's definition of things like
708
# 'whitespace'. Since this function's purpose is largely cosmetic,
709
# the choice has been made to use each language's native features
710
# rather than force total conformity.
712
# One point for non-alphanumeric.
713
if not one[-1].isalnum() or not two[0].isalnum():
715
# Two points for whitespace.
716
if one[-1].isspace() or two[0].isspace():
718
# Three points for line breaks.
719
if (one[-1] == "\r" or one[-1] == "\n" or
720
two[0] == "\r" or two[0] == "\n"):
722
# Four points for blank lines.
723
if (re.search("\\n\\r?\\n$", one) or
724
re.match("^\\r?\\n\\r?\\n", two)):
729
# Intentionally ignore the first and last element (don't need checking).
730
while pointer < len(diffs) - 1:
731
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
732
diffs[pointer + 1][0] == self.DIFF_EQUAL):
733
# This is a single edit surrounded by equalities.
734
equality1 = diffs[pointer - 1][1]
735
edit = diffs[pointer][1]
736
equality2 = diffs[pointer + 1][1]
738
# First, shift the edit as far left as possible.
739
commonOffset = self.diff_commonSuffix(equality1, edit)
741
commonString = edit[-commonOffset:]
742
equality1 = equality1[:-commonOffset]
743
edit = commonString + edit[:-commonOffset]
744
equality2 = commonString + equality2
746
# Second, step character by character right, looking for the best fit.
747
bestEquality1 = equality1
749
bestEquality2 = equality2
750
bestScore = (diff_cleanupSemanticScore(equality1, edit) +
751
diff_cleanupSemanticScore(edit, equality2))
752
while edit and equality2 and edit[0] == equality2[0]:
754
edit = edit[1:] + equality2[0]
755
equality2 = equality2[1:]
756
score = (diff_cleanupSemanticScore(equality1, edit) +
757
diff_cleanupSemanticScore(edit, equality2))
758
# The >= encourages trailing rather than leading whitespace on edits.
759
if score >= bestScore:
761
bestEquality1 = equality1
763
bestEquality2 = equality2
765
if diffs[pointer - 1][1] != bestEquality1:
766
# We have an improvement, save it back to the diff.
768
diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
770
del diffs[pointer - 1]
772
diffs[pointer] = (diffs[pointer][0], bestEdit)
774
diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
776
del diffs[pointer + 1]
780
def diff_cleanupEfficiency(self, diffs):
781
"""Reduce the number of edits by eliminating operationally trivial
785
diffs: Array of diff tuples.
788
equalities = [] # Stack of indices where equalities are found.
789
lastequality = '' # Always equal to equalities[-1][1]
790
pointer = 0 # Index of current position.
791
pre_ins = False # Is there an insertion operation before the last equality.
792
pre_del = False # Is there a deletion operation before the last equality.
793
post_ins = False # Is there an insertion operation after the last equality.
794
post_del = False # Is there a deletion operation after the last equality.
795
while pointer < len(diffs):
796
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
797
if (len(diffs[pointer][1]) < self.Diff_EditCost and
798
(post_ins or post_del)):
800
equalities.append(pointer)
803
lastequality = diffs[pointer][1]
805
# Not a candidate, and can never become one.
809
post_ins = post_del = False
810
else: # an insertion or deletion
811
if diffs[pointer][0] == self.DIFF_DELETE:
816
# Five types to be split:
817
# <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
818
# <ins>A</ins>X<ins>C</ins><del>D</del>
819
# <ins>A</ins><del>B</del>X<ins>C</ins>
820
# <ins>A</del>X<ins>C</ins><del>D</del>
821
# <ins>A</ins><del>B</del>X<del>C</del>
823
if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
824
((len(lastequality) < self.Diff_EditCost / 2) and
825
(pre_ins + pre_del + post_ins + post_del) == 3)):
827
diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
828
# Change second copy to insert.
829
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
830
diffs[equalities[-1] + 1][1])
831
equalities.pop() # Throw away the equality we just deleted
833
if pre_ins and pre_del:
834
# No changes made which could affect previous entry, keep going.
835
post_ins = post_del = True
839
equalities.pop() # Throw away the previous equality
841
pointer = equalities[-1]
844
post_ins = post_del = False
849
self.diff_cleanupMerge(diffs)
851
def diff_cleanupMerge(self, diffs):
852
"""Reorder and merge like edit sections. Merge equalities.
853
Any edit section can move as long as it doesn't cross an equality.
856
diffs: Array of diff tuples.
858
diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.
864
while pointer < len(diffs):
865
if diffs[pointer][0] == self.DIFF_INSERT:
867
text_insert += diffs[pointer][1]
869
elif diffs[pointer][0] == self.DIFF_DELETE:
871
text_delete += diffs[pointer][1]
873
elif diffs[pointer][0] == self.DIFF_EQUAL:
874
# Upon reaching an equality, check for prior redundancies.
875
if count_delete != 0 or count_insert != 0:
876
if count_delete != 0 and count_insert != 0:
877
# Factor out any common prefixies.
878
commonlength = self.diff_commonPrefix(text_insert, text_delete)
879
if commonlength != 0:
880
x = pointer - count_delete - count_insert - 1
881
if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
882
diffs[x] = (diffs[x][0], diffs[x][1] +
883
text_insert[:commonlength])
885
diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
39
"""Class containing the diff, match and patch methods.
41
Also contains the behaviour settings.
46
"""Inits a diff_match_patch object with default settings.
48
Redefine these in your program to override the defaults.
52
# Number of seconds to map a diff before giving up (0 for infinity).
53
self.Diff_Timeout = 1.0
54
# Cost of an empty edit operation in terms of edit characters.
55
self.Diff_EditCost = 4
56
# The size beyond which the double-ended diff activates.
57
# Double-ending is twice as fast, but less accurate.
58
self.Diff_DualThreshold = 32
59
# At what point is no match declared (0.0 = perfection, 1.0 = very
61
self.Match_Threshold = 0.5
62
# How far to search for a match (0 = exact location, 1000+ = broad match).
63
# A match this many characters away from the expected location will add
64
# 1.0 to the score (0.0 is a perfect match).
65
self.Match_Distance = 1000
66
# When deleting a large block of text (over ~64 characters), how close does
67
# the contents have to match the expected contents. (0.0 = perfection,
68
# 1.0 = very loose). Note that Match_Threshold controls how closely the
69
# end points of a delete need to match.
70
self.Patch_DeleteThreshold = 0.5
71
# Chunk size for context length.
74
# How many bits in a number?
75
# Python has no maximum, thus to disable patch splitting set to 0.
76
# However to avoid long patches in certain pathological cases, use 32.
77
# Multiple short patches (using native ints) are much faster than long
79
self.Match_MaxBits = 32
83
# The data structure representing a diff is an array of tuples:
84
# [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
85
# which means: delete "Hello", add "Goodbye" and keep " world."
90
def diff_main(self, text1, text2, checklines=True):
91
"""Find the differences between two texts. Simplifies the problem by
92
stripping any common prefix or suffix off the texts before diffing.
95
text1: Old string to be diffed.
96
text2: New string to be diffed.
97
checklines: Optional speedup flag. If present and false, then don't run
98
a line-level diff first to identify the changed areas.
99
Defaults to true, which does a faster, slightly less optimal diff.
106
# Check for equality (speedup)
108
return [(self.DIFF_EQUAL, text1)]
110
# Trim off common prefix (speedup)
111
commonlength = self.diff_commonPrefix(text1, text2)
112
commonprefix = text1[:commonlength]
113
text1 = text1[commonlength:]
114
text2 = text2[commonlength:]
116
# Trim off common suffix (speedup)
117
commonlength = self.diff_commonSuffix(text1, text2)
118
if commonlength == 0:
121
commonsuffix = text1[-commonlength:]
122
text1 = text1[:-commonlength]
123
text2 = text2[:-commonlength]
125
# Compute the diff on the middle block
126
diffs = self.diff_compute(text1, text2, checklines)
128
# Restore the prefix and suffix
130
diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
132
diffs.append((self.DIFF_EQUAL, commonsuffix))
133
self.diff_cleanupMerge(diffs)
136
def diff_compute(self, text1, text2, checklines):
137
"""Find the differences between two texts. Assumes that the texts do
138
not have any common prefix or suffix.
141
text1: Old string to be diffed.
142
text2: New string to be diffed.
143
checklines: Speedup flag. If false, then don't run a line-level diff
144
first to identify the changed areas.
145
If true, then run a faster, slightly less optimal diff.
152
# Just add some text (speedup)
153
return [(self.DIFF_INSERT, text2)]
156
# Just delete some text (speedup)
157
return [(self.DIFF_DELETE, text1)]
159
if len(text1) > len(text2):
160
(longtext, shorttext) = (text1, text2)
162
(shorttext, longtext) = (text1, text2)
163
i = longtext.find(shorttext)
165
# Shorter text is inside the longer text (speedup)
166
diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
167
(self.DIFF_INSERT, longtext[i + len(shorttext):])]
168
# Swap insertions for deletions if diff is reversed.
169
if len(text1) > len(text2):
170
diffs[0] = (self.DIFF_DELETE, diffs[0][1])
171
diffs[2] = (self.DIFF_DELETE, diffs[2][1])
173
longtext = shorttext = None # Garbage collect.
175
# Check to see if the problem can be split in two.
176
hm = self.diff_halfMatch(text1, text2)
178
# A half-match was found, sort out the return data.
179
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
180
# Send both pairs off for separate processing.
181
diffs_a = self.diff_main(text1_a, text2_a, checklines)
182
diffs_b = self.diff_main(text1_b, text2_b, checklines)
184
return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
186
# Perform a real diff.
187
if checklines and (len(text1) < 100 or len(text2) < 100):
188
checklines = False # Too trivial for the overhead.
190
# Scan the text on a line-by-line basis first.
191
(text1, text2, linearray) = self.diff_linesToChars(text1, text2)
193
diffs = self.diff_map(text1, text2)
194
if not diffs: # No acceptable result.
195
diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
197
# Convert the diff back to original text.
198
self.diff_charsToLines(diffs, linearray)
199
# Eliminate freak matches (e.g. blank lines)
200
self.diff_cleanupSemantic(diffs)
202
# Rediff any replacement blocks, this time character-by-character.
203
# Add a dummy entry at the end.
204
diffs.append((self.DIFF_EQUAL, ''))
210
while pointer < len(diffs):
211
if diffs[pointer][0] == self.DIFF_INSERT:
213
text_insert += diffs[pointer][1]
214
elif diffs[pointer][0] == self.DIFF_DELETE:
216
text_delete += diffs[pointer][1]
217
elif diffs[pointer][0] == self.DIFF_EQUAL:
218
# Upon reaching an equality, check for prior redundancies.
219
if count_delete >= 1 and count_insert >= 1:
220
# Delete the offending records and add the merged ones.
221
a = self.diff_main(text_delete, text_insert, False)
222
diffs[pointer - count_delete -
223
count_insert: pointer] = a
224
pointer = pointer - count_delete - \
225
count_insert + len(a)
887
text_insert = text_insert[commonlength:]
888
text_delete = text_delete[commonlength:]
889
# Factor out any common suffixies.
890
commonlength = self.diff_commonSuffix(text_insert, text_delete)
891
if commonlength != 0:
892
diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
894
text_insert = text_insert[:-commonlength]
895
text_delete = text_delete[:-commonlength]
896
# Delete the offending records and add the merged ones.
897
if count_delete == 0:
898
diffs[pointer - count_insert : pointer] = [
899
(self.DIFF_INSERT, text_insert)]
900
elif count_insert == 0:
901
diffs[pointer - count_delete : pointer] = [
902
(self.DIFF_DELETE, text_delete)]
904
diffs[pointer - count_delete - count_insert : pointer] = [
905
(self.DIFF_DELETE, text_delete),
906
(self.DIFF_INSERT, text_insert)]
907
pointer = pointer - count_delete - count_insert + 1
908
if count_delete != 0:
910
if count_insert != 0:
912
elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
913
# Merge this equality with the previous one.
914
diffs[pointer - 1] = (diffs[pointer - 1][0],
915
diffs[pointer - 1][1] + diffs[pointer][1])
233
diffs.pop() # Remove the dummy entry at the end.
236
def diff_linesToChars(self, text1, text2):
237
"""Split two texts into an array of strings. Reduce the texts to a
238
string of hashes where each Unicode character represents one line.
242
text2: Second string.
245
Three element tuple, containing the encoded text1, the encoded text2 and
246
the array of unique strings. The zeroth element of the array of unique
247
strings is intentionally blank.
250
lineArray = [] # e.g. lineArray[4] == "Hello\n"
251
lineHash = {} # e.g. lineHash["Hello\n"] == 4
253
# "\x00" is a valid character, but various debuggers don't like it.
254
# So we'll insert a junk entry to avoid generating a null character.
257
def diff_linesToCharsMunge(text):
258
"""Split a text into an array of strings. Reduce the texts to a
259
string of hashes where each Unicode character represents one line.
260
Modifies linearray and linehash through being a closure.
263
text: String to encode.
270
# Walk the text, pulling out a substring for each line.
271
# text.split('\n') would would temporarily double our memory footprint.
272
# Modifying text would create many large strings to garbage
276
while lineEnd < len(text) - 1:
277
lineEnd = text.find('\n', lineStart)
279
lineEnd = len(text) - 1
280
line = text[lineStart:lineEnd + 1]
281
lineStart = lineEnd + 1
284
chars.append(unichr(lineHash[line]))
286
lineArray.append(line)
287
lineHash[line] = len(lineArray) - 1
288
chars.append(unichr(len(lineArray) - 1))
289
return ''.join(chars)
291
chars1 = diff_linesToCharsMunge(text1)
292
chars2 = diff_linesToCharsMunge(text2)
293
return (chars1, chars2, lineArray)
295
def diff_charsToLines(self, diffs, lineArray):
296
"""Rehydrate the text in a diff from a string of line hashes to real
300
diffs: Array of diff tuples.
301
lineArray: Array of unique strings.
304
for x in xrange(len(diffs)):
306
for char in diffs[x][1]:
307
text.append(lineArray[ord(char)])
308
diffs[x] = (diffs[x][0], ''.join(text))
310
def diff_map(self, text1, text2):
311
"""Explore the intersection points between the two texts.
314
text1: Old string to be diffed.
315
text2: New string to be diffed.
318
Array of diff tuples or None if no diff available.
322
# Unlike in most languages, Python counts time in seconds.
323
s_end = time.time() + self.Diff_Timeout # Don't run for too long.
324
# Cache the text lengths to prevent multiple calls.
325
text1_length = len(text1)
326
text2_length = len(text2)
327
max_d = text1_length + text2_length - 1
328
doubleEnd = self.Diff_DualThreshold * 2 < max_d
337
# If the total number of characters is odd, then the front path will
338
# collide with the reverse path.
339
front = (text1_length + text2_length) % 2
340
for d in xrange(max_d):
341
# Bail out if timeout reached.
342
if self.Diff_Timeout > 0 and time.time() > s_end:
345
# Walk the front path one step.
347
for k in xrange(-d, d + 1, 2):
348
if k == -d or k != d and v1[k - 1] < v1[k + 1]:
355
if front and footstep in footsteps:
358
footsteps[footstep] = d
360
while (not done and x < text1_length and y < text2_length and
361
text1[x] == text2[y]):
366
if front and footstep in footsteps:
369
footsteps[footstep] = d
372
v_map1[d][(x, y)] = True
373
if x == text1_length and y == text2_length:
374
# Reached the end in single-path mode.
375
return self.diff_path1(v_map1, text1, text2)
377
# Front path ran over reverse path.
378
v_map2 = v_map2[:footsteps[footstep] + 1]
379
a = self.diff_path1(v_map1, text1[:x], text2[:y])
380
b = self.diff_path2(v_map2, text1[x:], text2[y:])
384
# Walk the reverse path one step.
386
for k in xrange(-d, d + 1, 2):
387
if k == -d or k != d and v2[k - 1] < v2[k + 1]:
392
footstep = (text1_length - x, text2_length - y)
393
if not front and footstep in footsteps:
396
footsteps[footstep] = d
397
while (not done and x < text1_length and y < text2_length and
398
text1[-x - 1] == text2[-y - 1]):
401
footstep = (text1_length - x, text2_length - y)
402
if not front and footstep in footsteps:
405
footsteps[footstep] = d
408
v_map2[d][(x, y)] = True
410
# Reverse path ran over front path.
411
v_map1 = v_map1[:footsteps[footstep] + 1]
412
a = self.diff_path1(v_map1, text1[:text1_length - x],
413
text2[:text2_length - y])
414
b = self.diff_path2(v_map2, text1[text1_length - x:],
415
text2[text2_length - y:])
418
# Number of diffs equals number of characters, no commonality at all.
421
def diff_path1(self, v_map, text1, text2):
422
"""Work from the middle back to the start to determine the path.
425
v_map: Array of paths.
426
text1: Old string fragment to be diffed.
427
text2: New string fragment to be diffed.
430
Array of diff tuples.
437
for d in xrange(len(v_map) - 2, -1, -1):
439
if (x - 1, y) in v_map[d]:
441
if last_op == self.DIFF_DELETE:
442
path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
444
path[:0] = [(self.DIFF_DELETE, text1[x])]
445
last_op = self.DIFF_DELETE
447
elif (x, y - 1) in v_map[d]:
449
if last_op == self.DIFF_INSERT:
450
path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
452
path[:0] = [(self.DIFF_INSERT, text2[y])]
453
last_op = self.DIFF_INSERT
458
assert text1[x] == text2[y], ('No diagonal. ' +
459
"Can't happen. (diff_path1)")
460
if last_op == self.DIFF_EQUAL:
461
path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
463
path[:0] = [(self.DIFF_EQUAL, text1[x])]
464
last_op = self.DIFF_EQUAL
467
def diff_path2(self, v_map, text1, text2):
468
"""Work from the middle back to the end to determine the path.
471
v_map: Array of paths.
472
text1: Old string fragment to be diffed.
473
text2: New string fragment to be diffed.
476
Array of diff tuples.
483
for d in xrange(len(v_map) - 2, -1, -1):
485
if (x - 1, y) in v_map[d]:
487
if last_op == self.DIFF_DELETE:
488
path[-1] = (self.DIFF_DELETE, path[-1]
491
path.append((self.DIFF_DELETE, text1[-x - 1]))
492
last_op = self.DIFF_DELETE
494
elif (x, y - 1) in v_map[d]:
496
if last_op == self.DIFF_INSERT:
497
path[-1] = (self.DIFF_INSERT, path[-1]
500
path.append((self.DIFF_INSERT, text2[-y - 1]))
501
last_op = self.DIFF_INSERT
506
assert text1[-x - 1] == text2[-y - 1], ('No diagonal. ' +
507
"Can't happen. (diff_path2)")
508
if last_op == self.DIFF_EQUAL:
509
path[-1] = (self.DIFF_EQUAL, path[-1]
512
path.append((self.DIFF_EQUAL, text1[-x - 1]))
513
last_op = self.DIFF_EQUAL
516
def diff_commonPrefix(self, text1, text2):
517
"""Determine the common prefix of two strings.
521
text2: Second string.
524
The number of characters common to the start of each string.
527
# Quick check for common null cases.
528
if not text1 or not text2 or text1[0] != text2[0]:
531
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
533
pointermax = min(len(text1), len(text2))
534
pointermid = pointermax
536
while pointermin < pointermid:
537
if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
538
pointermin = pointermid
539
pointerstart = pointermin
541
pointermax = pointermid
542
pointermid = int((pointermax - pointermin) / 2 + pointermin)
545
def diff_commonSuffix(self, text1, text2):
546
"""Determine the common suffix of two strings.
550
text2: Second string.
553
The number of characters common to the end of each string.
556
# Quick check for common null cases.
557
if not text1 or not text2 or text1[-1] != text2[-1]:
560
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
562
pointermax = min(len(text1), len(text2))
563
pointermid = pointermax
565
while pointermin < pointermid:
566
if (text1[-pointermid:len(text1) - pointerend] ==
567
text2[-pointermid:len(text2) - pointerend]):
568
pointermin = pointermid
569
pointerend = pointermin
571
pointermax = pointermid
572
pointermid = int((pointermax - pointermin) / 2 + pointermin)
575
def diff_halfMatch(self, text1, text2):
576
"""Do the two texts share a substring which is at least half the length
581
text2: Second string.
584
Five element Array, containing the prefix of text1, the suffix of text1,
585
the prefix of text2, the suffix of text2 and the common middle. Or None
586
if there was no match.
589
if len(text1) > len(text2):
590
(longtext, shorttext) = (text1, text2)
592
(shorttext, longtext) = (text1, text2)
593
if len(longtext) < 10 or len(shorttext) < 1:
594
return None # Pointless.
596
def diff_halfMatchI(longtext, shorttext, i):
597
"""Does a substring of shorttext exist within longtext such that
598
the substring is at least half the length of longtext? Closure, but
599
does not reference any external variables.
602
longtext: Longer string.
603
shorttext: Shorter string.
604
i: Start index of quarter length substring within longtext.
607
Five element Array, containing the prefix of longtext, the suffix of
608
longtext, the prefix of shorttext, the suffix of shorttext and the
609
common middle. Or None if there was no match.
612
seed = longtext[i:i + len(longtext) / 4]
614
j = shorttext.find(seed)
616
prefixLength = self.diff_commonPrefix(
617
longtext[i:], shorttext[j:])
618
suffixLength = self.diff_commonSuffix(
619
longtext[:i], shorttext[:j])
620
if len(best_common) < suffixLength + prefixLength:
621
best_common = (shorttext[j - suffixLength:j] +
622
shorttext[j:j + prefixLength])
623
best_longtext_a = longtext[:i - suffixLength]
624
best_longtext_b = longtext[i + prefixLength:]
625
best_shorttext_a = shorttext[:j - suffixLength]
626
best_shorttext_b = shorttext[j + prefixLength:]
627
j = shorttext.find(seed, j + 1)
629
if len(best_common) >= len(longtext) / 2:
630
return (best_longtext_a, best_longtext_b,
631
best_shorttext_a, best_shorttext_b, best_common)
635
# First check if the second quarter is the seed for a half-match.
636
hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
637
# Check again based on the third quarter.
638
hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
639
if not hm1 and not hm2:
646
# Both matched. Select the longest.
647
if len(hm1[4]) > len(hm2[4]):
652
# A half-match was found, sort out the return data.
653
if len(text1) > len(text2):
654
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
656
(text2_a, text2_b, text1_a, text1_b, mid_common) = hm
657
return (text1_a, text1_b, text2_a, text2_b, mid_common)
659
def diff_cleanupSemantic(self, diffs):
660
"""Reduce the number of edits by eliminating semantically trivial
664
diffs: Array of diff tuples.
668
equalities = [] # Stack of indices where equalities are found.
669
lastequality = None # Always equal to equalities[-1][1]
670
pointer = 0 # Index of current position.
671
# Number of chars that changed prior to the equality.
673
length_changes2 = 0 # Number of chars that changed after the equality.
674
while pointer < len(diffs):
675
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
676
equalities.append(pointer)
677
length_changes1 = length_changes2
679
lastequality = diffs[pointer][1]
680
else: # an insertion or deletion
681
length_changes2 += len(diffs[pointer][1])
682
if (lastequality != None and (len(lastequality) <= length_changes1) and
683
(len(lastequality) <= length_changes2)):
685
diffs.insert(equalities[-1],
686
(self.DIFF_DELETE, lastequality))
687
# Change second copy to insert.
688
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
689
diffs[equalities[-1] + 1][1])
690
# Throw away the equality we just deleted.
692
# Throw away the previous equality (it needs to be
694
if len(equalities) != 0:
697
pointer = equalities[-1]
700
length_changes1 = 0 # Reset the counters.
707
self.diff_cleanupMerge(diffs)
709
self.diff_cleanupSemanticLossless(diffs)
711
def diff_cleanupSemanticLossless(self, diffs):
712
"""Look for single edits surrounded on both sides by equalities
713
which can be shifted sideways to align the edit to a word boundary.
714
e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
717
diffs: Array of diff tuples.
720
def diff_cleanupSemanticScore(one, two):
721
"""Given two strings, compute a score representing whether the
722
internal boundary falls on logical boundaries. Scores range from 5
723
(best) to 0 (worst). Closure, but does not reference any external
734
if not one or not two:
735
# Edges are the best.
738
# Each port of this function behaves slightly differently due to
739
# subtle differences in each language's definition of things like
740
# 'whitespace'. Since this function's purpose is largely cosmetic,
741
# the choice has been made to use each language's native features
742
# rather than force total conformity.
744
# One point for non-alphanumeric.
745
if not one[-1].isalnum() or not two[0].isalnum():
747
# Two points for whitespace.
748
if one[-1].isspace() or two[0].isspace():
750
# Three points for line breaks.
751
if (one[-1] == '\r' or one[-1] == '\n' or
752
two[0] == '\r' or two[0] == '\n'):
754
# Four points for blank lines.
755
if (re.search('\\n\\r?\\n$', one) or
756
re.match('^\\r?\\n\\r?\\n', two)):
761
# Intentionally ignore the first and last element (don't need
763
while pointer < len(diffs) - 1:
764
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
765
diffs[pointer + 1][0] == self.DIFF_EQUAL):
766
# This is a single edit surrounded by equalities.
767
equality1 = diffs[pointer - 1][1]
768
edit = diffs[pointer][1]
769
equality2 = diffs[pointer + 1][1]
771
# First, shift the edit as far left as possible.
772
commonOffset = self.diff_commonSuffix(equality1, edit)
774
commonString = edit[-commonOffset:]
775
equality1 = equality1[:-commonOffset]
776
edit = commonString + edit[:-commonOffset]
777
equality2 = commonString + equality2
779
# Second, step character by character right, looking for the
781
bestEquality1 = equality1
783
bestEquality2 = equality2
784
bestScore = (diff_cleanupSemanticScore(equality1, edit) +
785
diff_cleanupSemanticScore(edit, equality2))
786
while edit and equality2 and edit[0] == equality2[0]:
788
edit = edit[1:] + equality2[0]
789
equality2 = equality2[1:]
790
score = (diff_cleanupSemanticScore(equality1, edit) +
791
diff_cleanupSemanticScore(edit, equality2))
792
# The >= encourages trailing rather than leading whitespace
794
if score >= bestScore:
796
bestEquality1 = equality1
798
bestEquality2 = equality2
800
if diffs[pointer - 1][1] != bestEquality1:
801
# We have an improvement, save it back to the diff.
803
diffs[pointer - 1] = (diffs[pointer - 1]
806
del diffs[pointer - 1]
808
diffs[pointer] = (diffs[pointer][0], bestEdit)
810
diffs[pointer + 1] = (diffs[pointer + 1]
813
del diffs[pointer + 1]
817
def diff_cleanupEfficiency(self, diffs):
818
"""Reduce the number of edits by eliminating operationally trivial
822
diffs: Array of diff tuples.
826
equalities = [] # Stack of indices where equalities are found.
827
lastequality = '' # Always equal to equalities[-1][1]
828
pointer = 0 # Index of current position.
829
# Is there an insertion operation before the last equality.
831
# Is there a deletion operation before the last equality.
833
# Is there an insertion operation after the last equality.
835
# Is there a deletion operation after the last equality.
837
while pointer < len(diffs):
838
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
839
if (len(diffs[pointer][1]) < self.Diff_EditCost and
840
(post_ins or post_del)):
842
equalities.append(pointer)
845
lastequality = diffs[pointer][1]
847
# Not a candidate, and can never become one.
851
post_ins = post_del = False
852
else: # an insertion or deletion
853
if diffs[pointer][0] == self.DIFF_DELETE:
858
# Five types to be split:
859
# <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
860
# <ins>A</ins>X<ins>C</ins><del>D</del>
861
# <ins>A</ins><del>B</del>X<ins>C</ins>
862
# <ins>A</del>X<ins>C</ins><del>D</del>
863
# <ins>A</ins><del>B</del>X<del>C</del>
865
if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
866
((len(lastequality) < self.Diff_EditCost / 2) and
867
(pre_ins + pre_del + post_ins + post_del) == 3)):
869
diffs.insert(equalities[-1],
870
(self.DIFF_DELETE, lastequality))
871
# Change second copy to insert.
872
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
873
diffs[equalities[-1] + 1][1])
874
equalities.pop() # Throw away the equality we just deleted
876
if pre_ins and pre_del:
877
# No changes made which could affect previous entry,
879
post_ins = post_del = True
883
equalities.pop() # Throw away the previous equality
885
pointer = equalities[-1]
888
post_ins = post_del = False
893
self.diff_cleanupMerge(diffs)
895
def diff_cleanupMerge(self, diffs):
896
"""Reorder and merge like edit sections. Merge equalities. Any edit
897
section can move as long as it doesn't cross an equality.
900
diffs: Array of diff tuples.
903
diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.
925
if diffs[-1][1] == '':
926
diffs.pop() # Remove the dummy entry at the end.
928
# Second pass: look for single edits surrounded on both sides by equalities
929
# which can be shifted sideways to eliminate an equality.
930
# e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
933
# Intentionally ignore the first and last element (don't need checking).
934
while pointer < len(diffs) - 1:
935
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
936
diffs[pointer + 1][0] == self.DIFF_EQUAL):
937
# This is a single edit surrounded by equalities.
938
if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
939
# Shift the edit over the previous equality.
940
diffs[pointer] = (diffs[pointer][0],
941
diffs[pointer - 1][1] +
942
diffs[pointer][1][:-len(diffs[pointer - 1][1])])
943
diffs[pointer + 1] = (diffs[pointer + 1][0],
944
diffs[pointer - 1][1] + diffs[pointer + 1][1])
945
del diffs[pointer - 1]
947
elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
948
# Shift the edit over the next equality.
949
diffs[pointer - 1] = (diffs[pointer - 1][0],
950
diffs[pointer - 1][1] + diffs[pointer + 1][1])
951
diffs[pointer] = (diffs[pointer][0],
952
diffs[pointer][1][len(diffs[pointer + 1][1]):] +
953
diffs[pointer + 1][1])
954
del diffs[pointer + 1]
958
# If shifts were made, the diff needs reordering and another shift sweep.
960
self.diff_cleanupMerge(diffs)
962
def diff_xIndex(self, diffs, loc):
963
"""loc is a location in text1, compute and return the equivalent location
964
in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
967
diffs: Array of diff tuples.
968
loc: Location within text1.
971
Location within text2.
977
for x in xrange(len(diffs)):
978
(op, text) = diffs[x]
979
if op != self.DIFF_INSERT: # Equality or deletion.
981
if op != self.DIFF_DELETE: # Equality or insertion.
983
if chars1 > loc: # Overshot the location.
988
if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
989
# The location was deleted.
991
# Add the remaining len(character).
992
return last_chars2 + (loc - last_chars1)
994
def diff_prettyHtml(self, diffs):
995
"""Convert a diff array into a pretty HTML report.
998
diffs: Array of diff tuples.
1001
HTML representation.
1005
for (op, data) in diffs:
1006
text = (data.replace("&", "&").replace("<", "<")
1007
.replace(">", ">").replace("\n", "¶<BR>"))
1008
if op == self.DIFF_INSERT:
1009
html.append("<SPAN STYLE=\"color: #00FF00;\" TITLE=\"i=%i\">%s</SPAN>"
1011
elif op == self.DIFF_DELETE:
1012
html.append("<SPAN STYLE=\"color: #FF0000;\" TITLE=\"i=%i\">%s</SPAN>"
1014
elif op == self.DIFF_EQUAL:
1015
html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text))
1016
if op != self.DIFF_DELETE:
1018
return "".join(html)
1020
def diff_text1(self, diffs):
1021
"""Compute and return the source text (all equalities and deletions).
1024
diffs: Array of diff tuples.
1030
for (op, data) in diffs:
1031
if op != self.DIFF_INSERT:
1033
return "".join(text)
1035
def diff_text2(self, diffs):
1036
"""Compute and return the destination text (all equalities and insertions).
1039
diffs: Array of diff tuples.
1045
for (op, data) in diffs:
1046
if op != self.DIFF_DELETE:
1048
return "".join(text)
1050
def diff_levenshtein(self, diffs):
1051
"""Compute the Levenshtein distance; the number of inserted, deleted or
1052
substituted characters.
1055
diffs: Array of diff tuples.
1063
for (op, data) in diffs:
1064
if op == self.DIFF_INSERT:
1065
insertions += len(data)
1066
elif op == self.DIFF_DELETE:
1067
deletions += len(data)
1068
elif op == self.DIFF_EQUAL:
1069
# A deletion and an insertion is one substitution.
1070
levenshtein += max(insertions, deletions)
909
while pointer < len(diffs):
910
if diffs[pointer][0] == self.DIFF_INSERT:
912
text_insert += diffs[pointer][1]
914
elif diffs[pointer][0] == self.DIFF_DELETE:
916
text_delete += diffs[pointer][1]
918
elif diffs[pointer][0] == self.DIFF_EQUAL:
919
# Upon reaching an equality, check for prior redundancies.
920
if count_delete != 0 or count_insert != 0:
921
if count_delete != 0 and count_insert != 0:
922
# Factor out any common prefixies.
923
commonlength = self.diff_commonPrefix(
924
text_insert, text_delete)
925
if commonlength != 0:
926
x = pointer - count_delete - count_insert - 1
927
if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
928
diffs[x] = (diffs[x][0], diffs[x][1] +
929
text_insert[:commonlength])
932
0, (self.DIFF_EQUAL, text_insert[:commonlength]))
934
text_insert = text_insert[commonlength:]
935
text_delete = text_delete[commonlength:]
936
# Factor out any common suffixies.
937
commonlength = self.diff_commonSuffix(
938
text_insert, text_delete)
939
if commonlength != 0:
940
diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
942
text_insert = text_insert[:-commonlength]
943
text_delete = text_delete[:-commonlength]
944
# Delete the offending records and add the merged ones.
945
if count_delete == 0:
946
diffs[pointer - count_insert: pointer] = [
947
(self.DIFF_INSERT, text_insert)]
948
elif count_insert == 0:
949
diffs[pointer - count_delete: pointer] = [
950
(self.DIFF_DELETE, text_delete)]
952
diffs[pointer - count_delete - count_insert: pointer] = [
953
(self.DIFF_DELETE, text_delete),
954
(self.DIFF_INSERT, text_insert)]
955
pointer = pointer - count_delete - count_insert + 1
956
if count_delete != 0:
958
if count_insert != 0:
960
elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
961
# Merge this equality with the previous one.
962
diffs[pointer - 1] = (diffs[pointer - 1][0],
963
diffs[pointer - 1][1] + diffs[pointer][1])
973
if diffs[-1][1] == '':
974
diffs.pop() # Remove the dummy entry at the end.
976
# Second pass: look for single edits surrounded on both sides by equalities
977
# which can be shifted sideways to eliminate an equality.
978
# e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
981
# Intentionally ignore the first and last element (don't need
983
while pointer < len(diffs) - 1:
984
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
985
diffs[pointer + 1][0] == self.DIFF_EQUAL):
986
# This is a single edit surrounded by equalities.
987
if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
988
# Shift the edit over the previous equality.
989
diffs[pointer] = (diffs[pointer][0],
990
diffs[pointer - 1][1] +
991
diffs[pointer][1][:-len(diffs[pointer - 1][1])])
992
diffs[pointer + 1] = (diffs[pointer + 1][0],
993
diffs[pointer - 1][1] + diffs[pointer + 1][1])
994
del diffs[pointer - 1]
996
elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
997
# Shift the edit over the next equality.
998
diffs[pointer - 1] = (diffs[pointer - 1][0],
999
diffs[pointer - 1][1] + diffs[pointer + 1][1])
1000
diffs[pointer] = (diffs[pointer][0],
1001
diffs[pointer][1][len(diffs[pointer + 1][1]):] +
1002
diffs[pointer + 1][1])
1003
del diffs[pointer + 1]
1007
# If shifts were made, the diff needs reordering and another shift
1010
self.diff_cleanupMerge(diffs)
1012
def diff_xIndex(self, diffs, loc):
1013
"""loc is a location in text1, compute and return the equivalent location
1014
in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1017
diffs: Array of diff tuples.
1018
loc: Location within text1.
1021
Location within text2.
1027
for x in xrange(len(diffs)):
1028
(op, text) = diffs[x]
1029
if op != self.DIFF_INSERT: # Equality or deletion.
1031
if op != self.DIFF_DELETE: # Equality or insertion.
1033
if chars1 > loc: # Overshot the location.
1035
last_chars1 = chars1
1036
last_chars2 = chars2
1038
if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1039
# The location was deleted.
1041
# Add the remaining len(character).
1042
return last_chars2 + (loc - last_chars1)
1044
def diff_prettyHtml(self, diffs):
1045
"""Convert a diff array into a pretty HTML report.
1048
diffs: Array of diff tuples.
1051
HTML representation.
1056
for (op, data) in diffs:
1057
text = (data.replace('&', '&').replace('<', '<')
1058
.replace('>', '>').replace('\n', '¶<BR>'))
1059
if op == self.DIFF_INSERT:
1060
html.append("<SPAN STYLE=\"color: #00FF00;\" TITLE=\"i=%i\">%s</SPAN>"
1062
elif op == self.DIFF_DELETE:
1063
html.append("<SPAN STYLE=\"color: #FF0000;\" TITLE=\"i=%i\">%s</SPAN>"
1065
elif op == self.DIFF_EQUAL:
1066
html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text))
1067
if op != self.DIFF_DELETE:
1069
return ''.join(html)
1071
def diff_text1(self, diffs):
1072
"""Compute and return the source text (all equalities and deletions).
1075
diffs: Array of diff tuples.
1082
for (op, data) in diffs:
1083
if op != self.DIFF_INSERT:
1085
return ''.join(text)
1087
def diff_text2(self, diffs):
1088
"""Compute and return the destination text (all equalities and
1092
diffs: Array of diff tuples.
1099
for (op, data) in diffs:
1100
if op != self.DIFF_DELETE:
1102
return ''.join(text)
1104
def diff_levenshtein(self, diffs):
1105
"""Compute the Levenshtein distance; the number of inserted, deleted or
1106
substituted characters.
1109
diffs: Array of diff tuples.
1073
levenshtein += max(insertions, deletions)
1076
def diff_toDelta(self, diffs):
1077
"""Crush the diff into an encoded string which describes the operations
1078
required to transform text1 into text2.
1079
E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1080
Operations are tab-separated. Inserted text is escaped using %xx notation.
1083
diffs: Array of diff tuples.
1089
for (op, data) in diffs:
1090
if op == self.DIFF_INSERT:
1091
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
1092
data = data.encode("utf-8")
1093
text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
1094
elif op == self.DIFF_DELETE:
1095
text.append("-%d" % len(data))
1096
elif op == self.DIFF_EQUAL:
1097
text.append("=%d" % len(data))
1098
return "\t".join(text)
1100
def diff_fromDelta(self, text1, delta):
1101
"""Given the original text1, and an encoded string which describes the
1102
operations required to transform text1 into text2, compute the full diff.
1105
text1: Source string for the diff.
1109
Array of diff tuples.
1112
ValueError: If invalid input.
1114
if type(delta) == unicode:
1115
# Deltas should be composed of a subset of ascii chars, Unicode not
1116
# required. If this encode raises UnicodeEncodeError, delta is invalid.
1117
delta = delta.encode("ascii")
1119
pointer = 0 # Cursor in text1
1120
tokens = delta.split("\t")
1121
for token in tokens:
1123
# Blank tokens are ok (from a trailing \t).
1125
# Each token begins with a one character parameter which specifies the
1126
# operation of this token (delete, insert, equality).
1129
param = urllib.unquote(param).decode("utf-8")
1130
diffs.append((self.DIFF_INSERT, param))
1131
elif token[0] == "-" or token[0] == "=":
1135
raise ValueError, "Invalid number in diff_fromDelta: " + param
1137
raise ValueError, "Negative number in diff_fromDelta: " + param
1138
text = text1[pointer : pointer + n]
1141
diffs.append((self.DIFF_EQUAL, text))
1143
diffs.append((self.DIFF_DELETE, text))
1145
# Anything else is an error.
1146
raise ValueError, ("Invalid diff operation in diff_fromDelta: " +
1148
if pointer != len(text1):
1150
"Delta length (%d) does not equal source text length (%d)." %
1151
(pointer, len(text1)))
1156
def match_main(self, text, pattern, loc):
1157
"""Locate the best instance of 'pattern' in 'text' near 'loc'.
1160
text: The text to search.
1161
pattern: The pattern to search for.
1162
loc: The location to search around.
1165
Best match index or -1.
1167
loc = max(0, min(loc, len(text)))
1169
# Shortcut (potentially not guaranteed by the algorithm)
1174
elif text[loc:loc + len(pattern)] == pattern:
1175
# Perfect match at the perfect spot! (Includes case of null pattern)
1178
# Do a fuzzy compare.
1179
match = self.match_bitap(text, pattern, loc)
1182
def match_bitap(self, text, pattern, loc):
1183
"""Locate the best instance of 'pattern' in 'text' near 'loc' using the
1187
text: The text to search.
1188
pattern: The pattern to search for.
1189
loc: The location to search around.
1192
Best match index or -1.
1194
# Python doesn't have a maxint limit, so ignore this check.
1195
#if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1196
# raise ValueError("Pattern too long for this application.")
1198
# Initialise the alphabet.
1199
s = self.match_alphabet(pattern)
1201
def match_bitapScore(e, x):
1202
"""Compute and return the score for a match with e errors and x location.
1203
Accesses loc and pattern through being a closure.
1206
e: Number of errors in match.
1207
x: Location of match.
1210
Overall score for match (0.0 = good, 1.0 = bad).
1212
accuracy = float(e) / len(pattern)
1213
proximity = abs(loc - x)
1214
if not self.Match_Distance:
1215
# Dodge divide by zero error.
1216
return proximity and 1.0 or accuracy
1217
return accuracy + (proximity / float(self.Match_Distance))
1219
# Highest score beyond which we give up.
1220
score_threshold = self.Match_Threshold
1221
# Is there a nearby exact match? (speedup)
1222
best_loc = text.find(pattern, loc)
1224
score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1225
# What about in the other direction? (speedup)
1226
best_loc = text.rfind(pattern, loc + len(pattern))
1228
score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1230
# Initialise the bit arrays.
1231
matchmask = 1 << (len(pattern) - 1)
1234
bin_max = len(pattern) + len(text)
1235
# Empty initialization added to appease pychecker.
1237
for d in xrange(len(pattern)):
1238
# Scan for the best match each iteration allows for one more error.
1239
# Run a binary search to determine how far from 'loc' we can stray at
1243
while bin_min < bin_mid:
1244
if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1248
bin_mid = (bin_max - bin_min) / 2 + bin_min
1250
# Use the result from this iteration as the maximum for the next.
1252
start = max(1, loc - bin_mid + 1)
1253
finish = min(loc + bin_mid, len(text)) + len(pattern)
1255
rd = range(finish + 1)
1256
rd.append((1 << d) - 1)
1257
for j in xrange(finish, start - 1, -1):
1258
if len(text) <= j - 1:
1262
charMatch = s.get(text[j - 1], 0)
1263
if d == 0: # First pass: exact match.
1264
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1265
else: # Subsequent passes: fuzzy match.
1266
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
1267
((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
1268
if rd[j] & matchmask:
1269
score = match_bitapScore(d, j - 1)
1270
# This match will almost certainly be better than any existing match.
1272
if score <= score_threshold:
1274
score_threshold = score
1277
# When passing loc, don't exceed our current distance from loc.
1278
start = max(1, 2 * loc - best_loc)
1118
for (op, data) in diffs:
1119
if op == self.DIFF_INSERT:
1120
insertions += len(data)
1121
elif op == self.DIFF_DELETE:
1122
deletions += len(data)
1123
elif op == self.DIFF_EQUAL:
1124
# A deletion and an insertion is one substitution.
1125
levenshtein += max(insertions, deletions)
1128
levenshtein += max(insertions, deletions)
1131
def diff_toDelta(self, diffs):
1132
"""Crush the diff into an encoded string which describes the operations
1133
required to transform text1 into text2.
1134
E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1135
Operations are tab-separated. Inserted text is escaped using %xx notation.
1138
diffs: Array of diff tuples.
1144
for (op, data) in diffs:
1145
if op == self.DIFF_INSERT:
1146
# High ascii will raise UnicodeDecodeError. Use Unicode
1148
data = data.encode('utf-8')
1149
text.append('+' + urllib.quote(data, "!~*'();/?:@&=+$,# "))
1150
elif op == self.DIFF_DELETE:
1151
text.append('-%d' % len(data))
1152
elif op == self.DIFF_EQUAL:
1153
text.append('=%d' % len(data))
1154
return '\t'.join(text)
1156
def diff_fromDelta(self, text1, delta):
1157
"""Given the original text1, and an encoded string which describes the
1158
operations required to transform text1 into text2, compute the full
1162
text1: Source string for the diff.
1166
Array of diff tuples.
1169
ValueError: If invalid input.
1172
if type(delta) == unicode:
1173
# Deltas should be composed of a subset of ascii chars, Unicode not
1174
# required. If this encode raises UnicodeEncodeError, delta is
1176
delta = delta.encode('ascii')
1178
pointer = 0 # Cursor in text1
1179
tokens = delta.split('\t')
1180
for token in tokens:
1182
# Blank tokens are ok (from a trailing \t).
1184
# Each token begins with a one character parameter which specifies the
1185
# operation of this token (delete, insert, equality).
1188
param = urllib.unquote(param).decode('utf-8')
1189
diffs.append((self.DIFF_INSERT, param))
1190
elif token[0] == '-' or token[0] == '=':
1194
raise ValueError, 'Invalid number in diff_fromDelta: ' + param
1196
raise ValueError, 'Negative number in diff_fromDelta: ' + param
1197
text = text1[pointer: pointer + n]
1200
diffs.append((self.DIFF_EQUAL, text))
1202
diffs.append((self.DIFF_DELETE, text))
1280
# Already passed loc, downhill from here on in.
1282
# No hope for a (better) match at greater error levels.
1283
if match_bitapScore(d + 1, loc) > score_threshold:
1288
def match_alphabet(self, pattern):
1289
"""Initialise the alphabet for the Bitap algorithm.
1292
pattern: The text to encode.
1295
Hash of character locations.
1298
for char in pattern:
1300
for i in xrange(len(pattern)):
1301
s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1306
def patch_addContext(self, patch, text):
1307
"""Increase the context until it is unique,
1308
but don't let the pattern expand beyond Match_MaxBits.
1311
patch: The patch to grow.
1316
pattern = text[patch.start2 : patch.start2 + patch.length1]
1319
# Look for the first and last matches of pattern in text. If two different
1320
# matches are found, increase the pattern length.
1321
while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
1322
0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
1323
self.Patch_Margin)):
1324
padding += self.Patch_Margin
1325
pattern = text[max(0, patch.start2 - padding) :
1326
patch.start2 + patch.length1 + padding]
1327
# Add one chunk for good luck.
1328
padding += self.Patch_Margin
1331
prefix = text[max(0, patch.start2 - padding) : patch.start2]
1333
patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1335
suffix = text[patch.start2 + patch.length1 :
1336
patch.start2 + patch.length1 + padding]
1338
patch.diffs.append((self.DIFF_EQUAL, suffix))
1340
# Roll back the start points.
1341
patch.start1 -= len(prefix)
1342
patch.start2 -= len(prefix)
1344
patch.length1 += len(prefix) + len(suffix)
1345
patch.length2 += len(prefix) + len(suffix)
1347
def patch_make(self, a, b=None, c=None):
1348
"""Compute a list of patches to turn text1 into text2.
1349
Use diffs if provided, otherwise compute it ourselves.
1350
There are four ways to call this function, depending on what data is
1351
available to the caller:
1353
a = text1, b = text2
1357
a = text1, b = diffs
1358
Method 4 (deprecated, use method 3):
1359
a = text1, b = text2, c = diffs
1362
a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1364
b: text2 (methods 1,4) or Array of diff tuples for text1 to
1365
text2 (method 3) or undefined (method 2).
1366
c: Array of diff tuples for text1 to text2 (method 4) or
1367
undefined (methods 1,2,3).
1370
Array of patch objects.
1374
# Note that texts may arrive as 'str' or 'unicode'.
1375
if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
1376
# Method 1: text1, text2
1377
# Compute diffs from text1 and text2.
1379
diffs = self.diff_main(text1, b, True)
1381
self.diff_cleanupSemantic(diffs)
1382
self.diff_cleanupEfficiency(diffs)
1383
elif isinstance(a, list) and b is None and c is None:
1385
# Compute text1 from diffs.
1387
text1 = self.diff_text1(diffs)
1388
elif isinstance(a, basestring) and isinstance(b, list) and c is None:
1389
# Method 3: text1, diffs
1392
elif (isinstance(a, basestring) and isinstance(b, basestring) and
1393
isinstance(c, list)):
1394
# Method 4: text1, text2, diffs
1395
# text2 is not used.
1399
raise ValueError("Unknown call format to patch_make.")
1402
return [] # Get rid of the None case.
1405
char_count1 = 0 # Number of characters into the text1 string.
1406
char_count2 = 0 # Number of characters into the text2 string.
1407
prepatch_text = text1 # Recreate the patches to determine context info.
1408
postpatch_text = text1
1409
for x in xrange(len(diffs)):
1410
(diff_type, diff_text) = diffs[x]
1411
if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1412
# A new patch starts here.
1413
patch.start1 = char_count1
1414
patch.start2 = char_count2
1415
if diff_type == self.DIFF_INSERT:
1417
patch.diffs.append(diffs[x])
1418
patch.length2 += len(diff_text)
1419
postpatch_text = (postpatch_text[:char_count2] + diff_text +
1420
postpatch_text[char_count2:])
1421
elif diff_type == self.DIFF_DELETE:
1423
patch.length1 += len(diff_text)
1424
patch.diffs.append(diffs[x])
1425
postpatch_text = (postpatch_text[:char_count2] +
1426
postpatch_text[char_count2 + len(diff_text):])
1427
elif (diff_type == self.DIFF_EQUAL and
1428
len(diff_text) <= 2 * self.Patch_Margin and
1429
len(patch.diffs) != 0 and len(diffs) != x + 1):
1430
# Small equality inside a patch.
1431
patch.diffs.append(diffs[x])
1432
patch.length1 += len(diff_text)
1433
patch.length2 += len(diff_text)
1435
if (diff_type == self.DIFF_EQUAL and
1436
len(diff_text) >= 2 * self.Patch_Margin):
1437
# Time for a new patch.
1204
# Anything else is an error.
1205
raise ValueError, ('Invalid diff operation in diff_fromDelta: ' +
1207
if pointer != len(text1):
1209
'Delta length (%d) does not equal source text length (%d).' %
1210
(pointer, len(text1)))
1215
def match_main(self, text, pattern, loc):
1216
"""Locate the best instance of 'pattern' in 'text' near 'loc'.
1219
text: The text to search.
1220
pattern: The pattern to search for.
1221
loc: The location to search around.
1224
Best match index or -1.
1227
loc = max(0, min(loc, len(text)))
1229
# Shortcut (potentially not guaranteed by the algorithm)
1234
elif text[loc:loc + len(pattern)] == pattern:
1235
# Perfect match at the perfect spot! (Includes case of null
1239
# Do a fuzzy compare.
1240
match = self.match_bitap(text, pattern, loc)
1243
def match_bitap(self, text, pattern, loc):
1244
"""Locate the best instance of 'pattern' in 'text' near 'loc' using the
1248
text: The text to search.
1249
pattern: The pattern to search for.
1250
loc: The location to search around.
1253
Best match index or -1.
1256
# Python doesn't have a maxint limit, so ignore this check.
1257
# if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1258
# raise ValueError("Pattern too long for this application.")
1260
# Initialise the alphabet.
1261
s = self.match_alphabet(pattern)
1263
def match_bitapScore(e, x):
1264
"""Compute and return the score for a match with e errors and x
1265
location. Accesses loc and pattern through being a closure.
1268
e: Number of errors in match.
1269
x: Location of match.
1272
Overall score for match (0.0 = good, 1.0 = bad).
1275
accuracy = float(e) / len(pattern)
1276
proximity = abs(loc - x)
1277
if not self.Match_Distance:
1278
# Dodge divide by zero error.
1279
return proximity and 1.0 or accuracy
1280
return accuracy + (proximity / float(self.Match_Distance))
1282
# Highest score beyond which we give up.
1283
score_threshold = self.Match_Threshold
1284
# Is there a nearby exact match? (speedup)
1285
best_loc = text.find(pattern, loc)
1287
score_threshold = min(match_bitapScore(
1288
0, best_loc), score_threshold)
1289
# What about in the other direction? (speedup)
1290
best_loc = text.rfind(pattern, loc + len(pattern))
1292
score_threshold = min(match_bitapScore(
1293
0, best_loc), score_threshold)
1295
# Initialise the bit arrays.
1296
matchmask = 1 << (len(pattern) - 1)
1299
bin_max = len(pattern) + len(text)
1300
# Empty initialization added to appease pychecker.
1302
for d in xrange(len(pattern)):
1303
# Scan for the best match each iteration allows for one more error.
1304
# Run a binary search to determine how far from 'loc' we can stray at
1308
while bin_min < bin_mid:
1309
if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1313
bin_mid = (bin_max - bin_min) / 2 + bin_min
1315
# Use the result from this iteration as the maximum for the next.
1317
start = max(1, loc - bin_mid + 1)
1318
finish = min(loc + bin_mid, len(text)) + len(pattern)
1320
rd = range(finish + 1)
1321
rd.append((1 << d) - 1)
1322
for j in xrange(finish, start - 1, -1):
1323
if len(text) <= j - 1:
1327
charMatch = s.get(text[j - 1], 0)
1328
if d == 0: # First pass: exact match.
1329
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1330
else: # Subsequent passes: fuzzy match.
1331
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
1332
((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
1333
if rd[j] & matchmask:
1334
score = match_bitapScore(d, j - 1)
1335
# This match will almost certainly be better than any existing match.
1337
if score <= score_threshold:
1339
score_threshold = score
1342
# When passing loc, don't exceed our current
1343
# distance from loc.
1344
start = max(1, 2 * loc - best_loc)
1346
# Already passed loc, downhill from here on in.
1348
# No hope for a (better) match at greater error levels.
1349
if match_bitapScore(d + 1, loc) > score_threshold:
1354
def match_alphabet(self, pattern):
1355
"""Initialise the alphabet for the Bitap algorithm.
1358
pattern: The text to encode.
1361
Hash of character locations.
1365
for char in pattern:
1367
for i in xrange(len(pattern)):
1368
s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1373
def patch_addContext(self, patch, text):
1374
"""Increase the context until it is unique, but don't let the pattern
1375
expand beyond Match_MaxBits.
1378
patch: The patch to grow.
1384
pattern = text[patch.start2: patch.start2 + patch.length1]
1387
# Look for the first and last matches of pattern in text. If two different
1388
# matches are found, increase the pattern length.
1389
while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
1390
0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
1391
self.Patch_Margin)):
1392
padding += self.Patch_Margin
1393
pattern = text[max(0, patch.start2 - padding):
1394
patch.start2 + patch.length1 + padding]
1395
# Add one chunk for good luck.
1396
padding += self.Patch_Margin
1399
prefix = text[max(0, patch.start2 - padding): patch.start2]
1401
patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1403
suffix = text[patch.start2 + patch.length1:
1404
patch.start2 + patch.length1 + padding]
1406
patch.diffs.append((self.DIFF_EQUAL, suffix))
1408
# Roll back the start points.
1409
patch.start1 -= len(prefix)
1410
patch.start2 -= len(prefix)
1412
patch.length1 += len(prefix) + len(suffix)
1413
patch.length2 += len(prefix) + len(suffix)
1415
def patch_make(self, a, b=None, c=None):
1416
"""Compute a list of patches to turn text1 into text2.
1417
Use diffs if provided, otherwise compute it ourselves.
1418
There are four ways to call this function, depending on what data is
1419
available to the caller:
1421
a = text1, b = text2
1425
a = text1, b = diffs
1426
Method 4 (deprecated, use method 3):
1427
a = text1, b = text2, c = diffs
1430
a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1432
b: text2 (methods 1,4) or Array of diff tuples for text1 to
1433
text2 (method 3) or undefined (method 2).
1434
c: Array of diff tuples for text1 to text2 (method 4) or
1435
undefined (methods 1,2,3).
1438
Array of patch objects.
1442
# Note that texts may arrive as 'str' or 'unicode'.
1443
if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
1444
# Method 1: text1, text2
1445
# Compute diffs from text1 and text2.
1447
diffs = self.diff_main(text1, b, True)
1449
self.diff_cleanupSemantic(diffs)
1450
self.diff_cleanupEfficiency(diffs)
1451
elif isinstance(a, list) and b is None and c is None:
1453
# Compute text1 from diffs.
1455
text1 = self.diff_text1(diffs)
1456
elif isinstance(a, basestring) and isinstance(b, list) and c is None:
1457
# Method 3: text1, diffs
1460
elif (isinstance(a, basestring) and isinstance(b, basestring) and
1461
isinstance(c, list)):
1462
# Method 4: text1, text2, diffs
1463
# text2 is not used.
1467
raise ValueError('Unknown call format to patch_make.')
1470
return [] # Get rid of the None case.
1473
char_count1 = 0 # Number of characters into the text1 string.
1474
char_count2 = 0 # Number of characters into the text2 string.
1475
# Recreate the patches to determine context info.
1476
prepatch_text = text1
1477
postpatch_text = text1
1478
for x in xrange(len(diffs)):
1479
(diff_type, diff_text) = diffs[x]
1480
if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1481
# A new patch starts here.
1482
patch.start1 = char_count1
1483
patch.start2 = char_count2
1484
if diff_type == self.DIFF_INSERT:
1486
patch.diffs.append(diffs[x])
1487
patch.length2 += len(diff_text)
1488
postpatch_text = (postpatch_text[:char_count2] + diff_text +
1489
postpatch_text[char_count2:])
1490
elif diff_type == self.DIFF_DELETE:
1492
patch.length1 += len(diff_text)
1493
patch.diffs.append(diffs[x])
1494
postpatch_text = (postpatch_text[:char_count2] +
1495
postpatch_text[char_count2 + len(diff_text):])
1496
elif (diff_type == self.DIFF_EQUAL and
1497
len(diff_text) <= 2 * self.Patch_Margin and
1498
len(patch.diffs) != 0 and len(diffs) != x + 1):
1499
# Small equality inside a patch.
1500
patch.diffs.append(diffs[x])
1501
patch.length1 += len(diff_text)
1502
patch.length2 += len(diff_text)
1504
if (diff_type == self.DIFF_EQUAL and
1505
len(diff_text) >= 2 * self.Patch_Margin):
1506
# Time for a new patch.
1507
if len(patch.diffs) != 0:
1508
self.patch_addContext(patch, prepatch_text)
1509
patches.append(patch)
1511
# Unlike Unidiff, our patch lists have a rolling context.
1512
# http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1513
# Update prepatch text & pos to reflect the application of the
1514
# just completed patch.
1515
prepatch_text = postpatch_text
1516
char_count1 = char_count2
1518
# Update the current character count.
1519
if diff_type != self.DIFF_INSERT:
1520
char_count1 += len(diff_text)
1521
if diff_type != self.DIFF_DELETE:
1522
char_count2 += len(diff_text)
1524
# Pick up the leftover patch if not empty.
1438
1525
if len(patch.diffs) != 0:
1439
self.patch_addContext(patch, prepatch_text)
1440
patches.append(patch)
1442
# Unlike Unidiff, our patch lists have a rolling context.
1443
# http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1444
# Update prepatch text & pos to reflect the application of the
1445
# just completed patch.
1446
prepatch_text = postpatch_text
1447
char_count1 = char_count2
1449
# Update the current character count.
1450
if diff_type != self.DIFF_INSERT:
1451
char_count1 += len(diff_text)
1452
if diff_type != self.DIFF_DELETE:
1453
char_count2 += len(diff_text)
1455
# Pick up the leftover patch if not empty.
1456
if len(patch.diffs) != 0:
1457
self.patch_addContext(patch, prepatch_text)
1458
patches.append(patch)
1461
def patch_deepCopy(self, patches):
1462
"""Given an array of patches, return another array that is identical.
1465
patches: Array of patch objects.
1468
Array of patch objects.
1471
for patch in patches:
1472
patchCopy = patch_obj()
1473
# No need to deep copy the tuples since they are immutable.
1474
patchCopy.diffs = patch.diffs[:]
1475
patchCopy.start1 = patch.start1
1476
patchCopy.start2 = patch.start2
1477
patchCopy.length1 = patch.length1
1478
patchCopy.length2 = patch.length2
1479
patchesCopy.append(patchCopy)
1482
def patch_apply(self, patches, text):
1483
"""Merge a set of patches onto the text. Return a patched text, as well
1484
as a list of true/false values indicating which patches were applied.
1487
patches: Array of patch objects.
1491
Two element Array, containing the new text and an array of boolean values.
1496
# Deep copy the patches so that no changes are made to originals.
1497
patches = self.patch_deepCopy(patches)
1499
nullPadding = self.patch_addPadding(patches)
1500
text = nullPadding + text + nullPadding
1501
self.patch_splitMax(patches)
1503
# delta keeps track of the offset between the expected and actual location
1504
# of the previous patch. If there are patches expected at positions 10 and
1505
# 20, but the first patch was found at 12, delta is 2 and the second patch
1506
# has an effective expected position of 22.
1509
for patch in patches:
1510
expected_loc = patch.start2 + delta
1511
text1 = self.diff_text1(patch.diffs)
1513
if len(text1) > self.Match_MaxBits:
1514
# patch_splitMax will only provide an oversized pattern in the case of
1516
start_loc = self.match_main(text, text1[:self.Match_MaxBits],
1519
end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
1520
expected_loc + len(text1) - self.Match_MaxBits)
1521
if end_loc == -1 or start_loc >= end_loc:
1522
# Can't find valid trailing context. Drop this patch.
1525
start_loc = self.match_main(text, text1, expected_loc)
1527
# No match found. :(
1528
results.append(False)
1529
# Subtract the delta for this failed patch from subsequent patches.
1530
delta -= patch.length2 - patch.length1
1533
results.append(True)
1534
delta = start_loc - expected_loc
1536
text2 = text[start_loc : start_loc + len(text1)]
1538
text2 = text[start_loc : end_loc + self.Match_MaxBits]
1540
# Perfect match, just shove the replacement text in.
1541
text = (text[:start_loc] + self.diff_text2(patch.diffs) +
1542
text[start_loc + len(text1):])
1545
# Run a diff to get a framework of equivalent indices.
1546
diffs = self.diff_main(text1, text2, False)
1547
if (len(text1) > self.Match_MaxBits and
1548
self.diff_levenshtein(diffs) / float(len(text1)) >
1549
self.Patch_DeleteThreshold):
1550
# The end points match, but the content is unacceptably bad.
1553
self.diff_cleanupSemanticLossless(diffs)
1555
for (op, data) in patch.diffs:
1556
if op != self.DIFF_EQUAL:
1557
index2 = self.diff_xIndex(diffs, index1)
1558
if op == self.DIFF_INSERT: # Insertion
1559
text = text[:start_loc + index2] + data + text[start_loc +
1561
elif op == self.DIFF_DELETE: # Deletion
1562
text = text[:start_loc + index2] + text[start_loc +
1563
self.diff_xIndex(diffs, index1 + len(data)):]
1564
if op != self.DIFF_DELETE:
1566
# Strip the padding off.
1567
text = text[len(nullPadding):-len(nullPadding)]
1568
return (text, results)
1570
def patch_addPadding(self, patches):
1571
"""Add some padding on text start and end so that edges can match
1572
something. Intended to be called only from within patch_apply.
1575
patches: Array of patch objects.
1578
The padding string added to each side.
1580
paddingLength = self.Patch_Margin
1582
for x in xrange(1, paddingLength + 1):
1583
nullPadding += chr(x)
1585
# Bump all the patches forward.
1586
for patch in patches:
1587
patch.start1 += paddingLength
1588
patch.start2 += paddingLength
1590
# Add some padding on start of first diff.
1593
if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1594
# Add nullPadding equality.
1595
diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1596
patch.start1 -= paddingLength # Should be 0.
1597
patch.start2 -= paddingLength # Should be 0.
1598
patch.length1 += paddingLength
1599
patch.length2 += paddingLength
1600
elif paddingLength > len(diffs[0][1]):
1601
# Grow first equality.
1602
extraLength = paddingLength - len(diffs[0][1])
1603
newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
1604
diffs[0] = (diffs[0][0], newText)
1605
patch.start1 -= extraLength
1606
patch.start2 -= extraLength
1607
patch.length1 += extraLength
1608
patch.length2 += extraLength
1610
# Add some padding on end of last diff.
1613
if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1614
# Add nullPadding equality.
1615
diffs.append((self.DIFF_EQUAL, nullPadding))
1616
patch.length1 += paddingLength
1617
patch.length2 += paddingLength
1618
elif paddingLength > len(diffs[-1][1]):
1619
# Grow last equality.
1620
extraLength = paddingLength - len(diffs[-1][1])
1621
newText = diffs[-1][1] + nullPadding[:extraLength]
1622
diffs[-1] = (diffs[-1][0], newText)
1623
patch.length1 += extraLength
1624
patch.length2 += extraLength
1628
def patch_splitMax(self, patches):
1629
"""Look through the patches and break up any which are longer than the
1630
maximum limit of the match algorithm.
1633
patches: Array of patch objects.
1635
if self.Match_MaxBits == 0:
1637
for x in xrange(len(patches)):
1638
if patches[x].length1 > self.Match_MaxBits:
1639
bigpatch = patches[x]
1640
# Remove the big old patch.
1643
patch_size = self.Match_MaxBits
1644
start1 = bigpatch.start1
1645
start2 = bigpatch.start2
1647
while len(bigpatch.diffs) != 0:
1648
# Create one of several smaller patches.
1651
patch.start1 = start1 - len(precontext)
1652
patch.start2 = start2 - len(precontext)
1654
patch.length1 = patch.length2 = len(precontext)
1655
patch.diffs.append((self.DIFF_EQUAL, precontext))
1657
while (len(bigpatch.diffs) != 0 and
1658
patch.length1 < patch_size - self.Patch_Margin):
1659
(diff_type, diff_text) = bigpatch.diffs[0]
1660
if diff_type == self.DIFF_INSERT:
1661
# Insertions are harmless.
1662
patch.length2 += len(diff_text)
1663
start2 += len(diff_text)
1664
patch.diffs.append(bigpatch.diffs.pop(0))
1666
elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
1667
patch.diffs[0][0] == self.DIFF_EQUAL and
1668
len(diff_text) > 2 * patch_size):
1669
# This is a large deletion. Let it pass in one chunk.
1670
patch.length1 += len(diff_text)
1671
start1 += len(diff_text)
1673
patch.diffs.append((diff_type, diff_text))
1674
del bigpatch.diffs[0]
1676
# Deletion or equality. Only take as much as we can stomach.
1677
diff_text = diff_text[:patch_size - patch.length1 -
1679
patch.length1 += len(diff_text)
1680
start1 += len(diff_text)
1681
if diff_type == self.DIFF_EQUAL:
1682
patch.length2 += len(diff_text)
1683
start2 += len(diff_text)
1687
patch.diffs.append((diff_type, diff_text))
1688
if diff_text == bigpatch.diffs[0][1]:
1689
del bigpatch.diffs[0]
1691
bigpatch.diffs[0] = (bigpatch.diffs[0][0],
1692
bigpatch.diffs[0][1][len(diff_text):])
1694
# Compute the head context for the next patch.
1695
precontext = self.diff_text2(patch.diffs)
1696
precontext = precontext[-self.Patch_Margin:]
1697
# Append the end context for this patch.
1698
postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
1700
patch.length1 += len(postcontext)
1701
patch.length2 += len(postcontext)
1702
if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1703
patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
1706
patch.diffs.append((self.DIFF_EQUAL, postcontext))
1710
patches.insert(x, patch)
1712
def patch_toText(self, patches):
1713
"""Take a list of patches and return a textual representation.
1716
patches: Array of patch objects.
1719
Text representation of patches.
1722
for patch in patches:
1723
text.append(str(patch))
1724
return "".join(text)
1726
def patch_fromText(self, textline):
1727
"""Parse a textual representation of patches and return a list of patch
1731
textline: Text representation of patches.
1734
Array of patch objects.
1737
ValueError: If invalid input.
1739
if type(textline) == unicode:
1740
# Patches should be composed of a subset of ascii chars, Unicode not
1741
# required. If this encode raises UnicodeEncodeError, patch is invalid.
1742
textline = textline.encode("ascii")
1746
text = textline.split('\n')
1747
while len(text) != 0:
1748
m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1750
raise ValueError, "Invalid patch string: " + text[0]
1752
patches.append(patch)
1753
patch.start1 = int(m.group(1))
1754
if m.group(2) == '':
1757
elif m.group(2) == '0':
1761
patch.length1 = int(m.group(2))
1763
patch.start2 = int(m.group(3))
1764
if m.group(4) == '':
1767
elif m.group(4) == '0':
1771
patch.length2 = int(m.group(4))
1775
while len(text) != 0:
1780
line = urllib.unquote(text[0][1:])
1781
line = line.decode("utf-8")
1784
patch.diffs.append((self.DIFF_INSERT, line))
1787
patch.diffs.append((self.DIFF_DELETE, line))
1790
patch.diffs.append((self.DIFF_EQUAL, line))
1792
# Start of next patch.
1795
# Blank line? Whatever.
1799
raise ValueError, "Invalid patch mode: '%s'\n%s" % (sign, line)
1526
self.patch_addContext(patch, prepatch_text)
1527
patches.append(patch)
1530
def patch_deepCopy(self, patches):
1531
"""Given an array of patches, return another array that is identical.
1534
patches: Array of patch objects.
1537
Array of patch objects.
1541
for patch in patches:
1542
patchCopy = patch_obj()
1543
# No need to deep copy the tuples since they are immutable.
1544
patchCopy.diffs = patch.diffs[:]
1545
patchCopy.start1 = patch.start1
1546
patchCopy.start2 = patch.start2
1547
patchCopy.length1 = patch.length1
1548
patchCopy.length2 = patch.length2
1549
patchesCopy.append(patchCopy)
1552
def patch_apply(self, patches, text):
1553
"""Merge a set of patches onto the text. Return a patched text, as
1554
well as a list of true/false values indicating which patches were
1558
patches: Array of patch objects.
1562
Two element Array, containing the new text and an array of boolean values.
1568
# Deep copy the patches so that no changes are made to originals.
1569
patches = self.patch_deepCopy(patches)
1571
nullPadding = self.patch_addPadding(patches)
1572
text = nullPadding + text + nullPadding
1573
self.patch_splitMax(patches)
1575
# delta keeps track of the offset between the expected and actual location
1576
# of the previous patch. If there are patches expected at positions 10 and
1577
# 20, but the first patch was found at 12, delta is 2 and the second patch
1578
# has an effective expected position of 22.
1581
for patch in patches:
1582
expected_loc = patch.start2 + delta
1583
text1 = self.diff_text1(patch.diffs)
1585
if len(text1) > self.Match_MaxBits:
1586
# patch_splitMax will only provide an oversized pattern in the case of
1588
start_loc = self.match_main(text, text1[:self.Match_MaxBits],
1591
end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
1592
expected_loc + len(text1) - self.Match_MaxBits)
1593
if end_loc == -1 or start_loc >= end_loc:
1594
# Can't find valid trailing context. Drop this patch.
1597
start_loc = self.match_main(text, text1, expected_loc)
1599
# No match found. :(
1600
results.append(False)
1601
# Subtract the delta for this failed patch from subsequent
1603
delta -= patch.length2 - patch.length1
1606
results.append(True)
1607
delta = start_loc - expected_loc
1609
text2 = text[start_loc: start_loc + len(text1)]
1611
text2 = text[start_loc: end_loc + self.Match_MaxBits]
1613
# Perfect match, just shove the replacement text in.
1614
text = (text[:start_loc] + self.diff_text2(patch.diffs) +
1615
text[start_loc + len(text1):])
1618
# Run a diff to get a framework of equivalent indices.
1619
diffs = self.diff_main(text1, text2, False)
1620
if (len(text1) > self.Match_MaxBits and
1621
self.diff_levenshtein(diffs) / float(len(text1)) >
1622
self.Patch_DeleteThreshold):
1623
# The end points match, but the content is unacceptably
1627
self.diff_cleanupSemanticLossless(diffs)
1629
for (op, data) in patch.diffs:
1630
if op != self.DIFF_EQUAL:
1631
index2 = self.diff_xIndex(diffs, index1)
1632
if op == self.DIFF_INSERT: # Insertion
1633
text = text[:start_loc + index2] + data + text[start_loc +
1635
elif op == self.DIFF_DELETE: # Deletion
1636
text = text[:start_loc + index2] + text[start_loc +
1637
self.diff_xIndex(diffs, index1 + len(data)):]
1638
if op != self.DIFF_DELETE:
1640
# Strip the padding off.
1641
text = text[len(nullPadding):-len(nullPadding)]
1642
return (text, results)
1644
def patch_addPadding(self, patches):
1645
"""Add some padding on text start and end so that edges can match
1646
something. Intended to be called only from within patch_apply.
1649
patches: Array of patch objects.
1652
The padding string added to each side.
1655
paddingLength = self.Patch_Margin
1657
for x in xrange(1, paddingLength + 1):
1658
nullPadding += chr(x)
1660
# Bump all the patches forward.
1661
for patch in patches:
1662
patch.start1 += paddingLength
1663
patch.start2 += paddingLength
1665
# Add some padding on start of first diff.
1668
if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1669
# Add nullPadding equality.
1670
diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1671
patch.start1 -= paddingLength # Should be 0.
1672
patch.start2 -= paddingLength # Should be 0.
1673
patch.length1 += paddingLength
1674
patch.length2 += paddingLength
1675
elif paddingLength > len(diffs[0][1]):
1676
# Grow first equality.
1677
extraLength = paddingLength - len(diffs[0][1])
1678
newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
1679
diffs[0] = (diffs[0][0], newText)
1680
patch.start1 -= extraLength
1681
patch.start2 -= extraLength
1682
patch.length1 += extraLength
1683
patch.length2 += extraLength
1685
# Add some padding on end of last diff.
1688
if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1689
# Add nullPadding equality.
1690
diffs.append((self.DIFF_EQUAL, nullPadding))
1691
patch.length1 += paddingLength
1692
patch.length2 += paddingLength
1693
elif paddingLength > len(diffs[-1][1]):
1694
# Grow last equality.
1695
extraLength = paddingLength - len(diffs[-1][1])
1696
newText = diffs[-1][1] + nullPadding[:extraLength]
1697
diffs[-1] = (diffs[-1][0], newText)
1698
patch.length1 += extraLength
1699
patch.length2 += extraLength
1703
def patch_splitMax(self, patches):
1704
"""Look through the patches and break up any which are longer than the
1705
maximum limit of the match algorithm.
1708
patches: Array of patch objects.
1711
if self.Match_MaxBits == 0:
1713
for x in xrange(len(patches)):
1714
if patches[x].length1 > self.Match_MaxBits:
1715
bigpatch = patches[x]
1716
# Remove the big old patch.
1719
patch_size = self.Match_MaxBits
1720
start1 = bigpatch.start1
1721
start2 = bigpatch.start2
1723
while len(bigpatch.diffs) != 0:
1724
# Create one of several smaller patches.
1727
patch.start1 = start1 - len(precontext)
1728
patch.start2 = start2 - len(precontext)
1730
patch.length1 = patch.length2 = len(precontext)
1731
patch.diffs.append((self.DIFF_EQUAL, precontext))
1733
while (len(bigpatch.diffs) != 0 and
1734
patch.length1 < patch_size - self.Patch_Margin):
1735
(diff_type, diff_text) = bigpatch.diffs[0]
1736
if diff_type == self.DIFF_INSERT:
1737
# Insertions are harmless.
1738
patch.length2 += len(diff_text)
1739
start2 += len(diff_text)
1740
patch.diffs.append(bigpatch.diffs.pop(0))
1742
elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
1743
patch.diffs[0][0] == self.DIFF_EQUAL and
1744
len(diff_text) > 2 * patch_size):
1745
# This is a large deletion. Let it pass in one
1747
patch.length1 += len(diff_text)
1748
start1 += len(diff_text)
1750
patch.diffs.append((diff_type, diff_text))
1751
del bigpatch.diffs[0]
1753
# Deletion or equality. Only take as much as we
1755
diff_text = diff_text[:patch_size - patch.length1 -
1757
patch.length1 += len(diff_text)
1758
start1 += len(diff_text)
1759
if diff_type == self.DIFF_EQUAL:
1760
patch.length2 += len(diff_text)
1761
start2 += len(diff_text)
1765
patch.diffs.append((diff_type, diff_text))
1766
if diff_text == bigpatch.diffs[0][1]:
1767
del bigpatch.diffs[0]
1769
bigpatch.diffs[0] = (bigpatch.diffs[0][0],
1770
bigpatch.diffs[0][1][len(diff_text):])
1772
# Compute the head context for the next patch.
1773
precontext = self.diff_text2(patch.diffs)
1774
precontext = precontext[-self.Patch_Margin:]
1775
# Append the end context for this patch.
1776
postcontext = self.diff_text1(bigpatch.diffs)[
1779
patch.length1 += len(postcontext)
1780
patch.length2 += len(postcontext)
1781
if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1782
patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
1785
patch.diffs.append((self.DIFF_EQUAL, postcontext))
1789
patches.insert(x, patch)
1791
def patch_toText(self, patches):
1792
"""Take a list of patches and return a textual representation.
1795
patches: Array of patch objects.
1798
Text representation of patches.
1802
for patch in patches:
1803
text.append(str(patch))
1804
return ''.join(text)
1806
def patch_fromText(self, textline):
1807
"""Parse a textual representation of patches and return a list of patch
1811
textline: Text representation of patches.
1814
Array of patch objects.
1817
ValueError: If invalid input.
1820
if type(textline) == unicode:
1821
# Patches should be composed of a subset of ascii chars, Unicode not
1822
# required. If this encode raises UnicodeEncodeError, patch is
1824
textline = textline.encode('ascii')
1828
text = textline.split('\n')
1829
while len(text) != 0:
1830
m = re.match('^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$', text[0])
1832
raise ValueError, 'Invalid patch string: ' + text[0]
1834
patches.append(patch)
1835
patch.start1 = int(m.group(1))
1836
if m.group(2) == '':
1839
elif m.group(2) == '0':
1843
patch.length1 = int(m.group(2))
1845
patch.start2 = int(m.group(3))
1846
if m.group(4) == '':
1849
elif m.group(4) == '0':
1853
patch.length2 = int(m.group(4))
1857
while len(text) != 0:
1862
line = urllib.unquote(text[0][1:])
1863
line = line.decode('utf-8')
1866
patch.diffs.append((self.DIFF_INSERT, line))
1869
patch.diffs.append((self.DIFF_DELETE, line))
1872
patch.diffs.append((self.DIFF_EQUAL, line))
1874
# Start of next patch.
1877
# Blank line? Whatever.
1881
raise ValueError, "Invalid patch mode: '%s'\n%s" % (sign, line)
1804
1886
class patch_obj:
  """Class representing one patch operation."""

  def __init__(self):
    """Initializes with an empty list of diffs."""
    # List of (op, text) diff tuples making up this patch.
    self.diffs = []
    # Start offsets (0-based) into text1 and text2; None until assigned.
    self.start1 = None
    self.start2 = None
    # Lengths of the affected regions in text1 and text2.
    self.length1 = 0
    self.length2 = 0

  def __str__(self):
    """Emulate GNU diff's format.
    Header: @@ -382,8 +481,9 @@
    Indices are printed as 1-based, not 0-based.

    Returns:
      The GNU diff string.
    """
    if self.length1 == 0:
      coords1 = str(self.start1) + ',0'
    elif self.length1 == 1:
      # Length of 1 is implied by an omitted length field.
      coords1 = str(self.start1 + 1)
    else:
      coords1 = str(self.start1 + 1) + ',' + str(self.length1)
    if self.length2 == 0:
      coords2 = str(self.start2) + ',0'
    elif self.length2 == 1:
      coords2 = str(self.start2 + 1)
    else:
      coords2 = str(self.start2 + 1) + ',' + str(self.length2)
    text = ['@@ -', coords1, ' +', coords2, ' @@\n']
    # Escape the body of the patch with %xx notation.
    for (op, data) in self.diffs:
      if op == diff_match_patch.DIFF_INSERT:
        text.append('+')
      elif op == diff_match_patch.DIFF_DELETE:
        text.append('-')
      elif op == diff_match_patch.DIFF_EQUAL:
        text.append(' ')
      # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
      data = data.encode('utf-8')
      text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + '\n')
    return ''.join(text)