# Taken from google because they do not provide a setup.py file. Thanks anyway

"""Diff Match and Patch

Copyright 2006 Google Inc.
http://code.google.com/p/google-diff-match-patch/

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""Functions for diff, match and patch.

Computes the difference between two texts to create a patch.
Applies the patch onto another text, allowing for errors.
"""
29
__author__ = 'fraser@google.com (Neil Fraser)'
36
class diff_match_patch:
37
"""Class containing the diff, match and patch methods.
39
Also contains the behaviour settings.
43
"""Inits a diff_match_patch object with default settings.
44
Redefine these in your program to override the defaults.
47
# Number of seconds to map a diff before giving up (0 for infinity).
48
self.Diff_Timeout = 1.0
49
# Cost of an empty edit operation in terms of edit characters.
50
self.Diff_EditCost = 4
51
# The size beyond which the double-ended diff activates.
52
# Double-ending is twice as fast, but less accurate.
53
self.Diff_DualThreshold = 32
54
# At what point is no match declared (0.0 = perfection, 1.0 = very loose).
55
self.Match_Threshold = 0.5
56
# How far to search for a match (0 = exact location, 1000+ = broad match).
57
# A match this many characters away from the expected location will add
58
# 1.0 to the score (0.0 is a perfect match).
59
self.Match_Distance = 1000
60
# When deleting a large block of text (over ~64 characters), how close does
61
# the contents have to match the expected contents. (0.0 = perfection,
62
# 1.0 = very loose). Note that Match_Threshold controls how closely the
63
# end points of a delete need to match.
64
self.Patch_DeleteThreshold = 0.5
65
# Chunk size for context length.
68
# How many bits in a number?
69
# Python has no maximum, thus to disable patch splitting set to 0.
70
# However to avoid long patches in certain pathological cases, use 32.
71
# Multiple short patches (using native ints) are much faster than long ones.
72
self.Match_MaxBits = 32
76
# The data structure representing a diff is an array of tuples:
77
# [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
78
# which means: delete "Hello", add "Goodbye" and keep " world."
83
def diff_main(self, text1, text2, checklines=True):
84
"""Find the differences between two texts. Simplifies the problem by
85
stripping any common prefix or suffix off the texts before diffing.
88
text1: Old string to be diffed.
89
text2: New string to be diffed.
90
checklines: Optional speedup flag. If present and false, then don't run
91
a line-level diff first to identify the changed areas.
92
Defaults to true, which does a faster, slightly less optimal diff.
98
# Check for equality (speedup)
100
return [(self.DIFF_EQUAL, text1)]
102
# Trim off common prefix (speedup)
103
commonlength = self.diff_commonPrefix(text1, text2)
104
commonprefix = text1[:commonlength]
105
text1 = text1[commonlength:]
106
text2 = text2[commonlength:]
108
# Trim off common suffix (speedup)
109
commonlength = self.diff_commonSuffix(text1, text2)
110
if commonlength == 0:
113
commonsuffix = text1[-commonlength:]
114
text1 = text1[:-commonlength]
115
text2 = text2[:-commonlength]
117
# Compute the diff on the middle block
118
diffs = self.diff_compute(text1, text2, checklines)
120
# Restore the prefix and suffix
122
diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
124
diffs.append((self.DIFF_EQUAL, commonsuffix))
125
self.diff_cleanupMerge(diffs)
128
def diff_compute(self, text1, text2, checklines):
129
"""Find the differences between two texts. Assumes that the texts do not
130
have any common prefix or suffix.
133
text1: Old string to be diffed.
134
text2: New string to be diffed.
135
checklines: Speedup flag. If false, then don't run a line-level diff
136
first to identify the changed areas.
137
If true, then run a faster, slightly less optimal diff.
143
# Just add some text (speedup)
144
return [(self.DIFF_INSERT, text2)]
147
# Just delete some text (speedup)
148
return [(self.DIFF_DELETE, text1)]
150
if len(text1) > len(text2):
151
(longtext, shorttext) = (text1, text2)
153
(shorttext, longtext) = (text1, text2)
154
i = longtext.find(shorttext)
156
# Shorter text is inside the longer text (speedup)
157
diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
158
(self.DIFF_INSERT, longtext[i + len(shorttext):])]
159
# Swap insertions for deletions if diff is reversed.
160
if len(text1) > len(text2):
161
diffs[0] = (self.DIFF_DELETE, diffs[0][1])
162
diffs[2] = (self.DIFF_DELETE, diffs[2][1])
164
longtext = shorttext = None # Garbage collect.
166
# Check to see if the problem can be split in two.
167
hm = self.diff_halfMatch(text1, text2)
169
# A half-match was found, sort out the return data.
170
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
171
# Send both pairs off for separate processing.
172
diffs_a = self.diff_main(text1_a, text2_a, checklines)
173
diffs_b = self.diff_main(text1_b, text2_b, checklines)
175
return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
177
# Perform a real diff.
178
if checklines and (len(text1) < 100 or len(text2) < 100):
179
checklines = False # Too trivial for the overhead.
181
# Scan the text on a line-by-line basis first.
182
(text1, text2, linearray) = self.diff_linesToChars(text1, text2)
184
diffs = self.diff_map(text1, text2)
185
if not diffs: # No acceptable result.
186
diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
188
# Convert the diff back to original text.
189
self.diff_charsToLines(diffs, linearray)
190
# Eliminate freak matches (e.g. blank lines)
191
self.diff_cleanupSemantic(diffs)
193
# Rediff any replacement blocks, this time character-by-character.
194
# Add a dummy entry at the end.
195
diffs.append((self.DIFF_EQUAL, ''))
201
while pointer < len(diffs):
202
if diffs[pointer][0] == self.DIFF_INSERT:
204
text_insert += diffs[pointer][1]
205
elif diffs[pointer][0] == self.DIFF_DELETE:
207
text_delete += diffs[pointer][1]
208
elif diffs[pointer][0] == self.DIFF_EQUAL:
209
# Upon reaching an equality, check for prior redundancies.
210
if count_delete >= 1 and count_insert >= 1:
211
# Delete the offending records and add the merged ones.
212
a = self.diff_main(text_delete, text_insert, False)
213
diffs[pointer - count_delete - count_insert : pointer] = a
214
pointer = pointer - count_delete - count_insert + len(a)
222
diffs.pop() # Remove the dummy entry at the end.
225
def diff_linesToChars(self, text1, text2):
226
"""Split two texts into an array of strings. Reduce the texts to a string
227
of hashes where each Unicode character represents one line.
231
text2: Second string.
234
Three element tuple, containing the encoded text1, the encoded text2 and
235
the array of unique strings. The zeroth element of the array of unique
236
strings is intentionally blank.
238
lineArray = [] # e.g. lineArray[4] == "Hello\n"
239
lineHash = {} # e.g. lineHash["Hello\n"] == 4
241
# "\x00" is a valid character, but various debuggers don't like it.
242
# So we'll insert a junk entry to avoid generating a null character.
245
def diff_linesToCharsMunge(text):
246
"""Split a text into an array of strings. Reduce the texts to a string
247
of hashes where each Unicode character represents one line.
248
Modifies linearray and linehash through being a closure.
251
text: String to encode.
257
# Walk the text, pulling out a substring for each line.
258
# text.split('\n') would would temporarily double our memory footprint.
259
# Modifying text would create many large strings to garbage collect.
262
while lineEnd < len(text) - 1:
263
lineEnd = text.find('\n', lineStart)
265
lineEnd = len(text) - 1
266
line = text[lineStart:lineEnd + 1]
267
lineStart = lineEnd + 1
270
chars.append(unichr(lineHash[line]))
272
lineArray.append(line)
273
lineHash[line] = len(lineArray) - 1
274
chars.append(unichr(len(lineArray) - 1))
275
return "".join(chars)
277
chars1 = diff_linesToCharsMunge(text1)
278
chars2 = diff_linesToCharsMunge(text2)
279
return (chars1, chars2, lineArray)
281
def diff_charsToLines(self, diffs, lineArray):
282
"""Rehydrate the text in a diff from a string of line hashes to real lines
286
diffs: Array of diff tuples.
287
lineArray: Array of unique strings.
289
for x in xrange(len(diffs)):
291
for char in diffs[x][1]:
292
text.append(lineArray[ord(char)])
293
diffs[x] = (diffs[x][0], "".join(text))
295
def diff_map(self, text1, text2):
296
"""Explore the intersection points between the two texts.
299
text1: Old string to be diffed.
300
text2: New string to be diffed.
303
Array of diff tuples or None if no diff available.
306
# Unlike in most languages, Python counts time in seconds.
307
s_end = time.time() + self.Diff_Timeout # Don't run for too long.
308
# Cache the text lengths to prevent multiple calls.
309
text1_length = len(text1)
310
text2_length = len(text2)
311
max_d = text1_length + text2_length - 1
312
doubleEnd = self.Diff_DualThreshold * 2 < max_d
321
# If the total number of characters is odd, then the front path will
322
# collide with the reverse path.
323
front = (text1_length + text2_length) % 2
324
for d in xrange(max_d):
325
# Bail out if timeout reached.
326
if self.Diff_Timeout > 0 and time.time() > s_end:
329
# Walk the front path one step.
331
for k in xrange(-d, d + 1, 2):
332
if k == -d or k != d and v1[k - 1] < v1[k + 1]:
339
if front and footstep in footsteps:
342
footsteps[footstep] = d
344
while (not done and x < text1_length and y < text2_length and
345
text1[x] == text2[y]):
350
if front and footstep in footsteps:
353
footsteps[footstep] = d
356
v_map1[d][(x, y)] = True
357
if x == text1_length and y == text2_length:
358
# Reached the end in single-path mode.
359
return self.diff_path1(v_map1, text1, text2)
361
# Front path ran over reverse path.
362
v_map2 = v_map2[:footsteps[footstep] + 1]
363
a = self.diff_path1(v_map1, text1[:x], text2[:y])
364
b = self.diff_path2(v_map2, text1[x:], text2[y:])
368
# Walk the reverse path one step.
370
for k in xrange(-d, d + 1, 2):
371
if k == -d or k != d and v2[k - 1] < v2[k + 1]:
376
footstep = (text1_length - x, text2_length - y)
377
if not front and footstep in footsteps:
380
footsteps[footstep] = d
381
while (not done and x < text1_length and y < text2_length and
382
text1[-x - 1] == text2[-y - 1]):
385
footstep = (text1_length - x, text2_length - y)
386
if not front and footstep in footsteps:
389
footsteps[footstep] = d
392
v_map2[d][(x, y)] = True
394
# Reverse path ran over front path.
395
v_map1 = v_map1[:footsteps[footstep] + 1]
396
a = self.diff_path1(v_map1, text1[:text1_length - x],
397
text2[:text2_length - y])
398
b = self.diff_path2(v_map2, text1[text1_length - x:],
399
text2[text2_length - y:])
402
# Number of diffs equals number of characters, no commonality at all.
405
def diff_path1(self, v_map, text1, text2):
406
"""Work from the middle back to the start to determine the path.
409
v_map: Array of paths.
410
text1: Old string fragment to be diffed.
411
text2: New string fragment to be diffed.
414
Array of diff tuples.
420
for d in xrange(len(v_map) - 2, -1, -1):
422
if (x - 1, y) in v_map[d]:
424
if last_op == self.DIFF_DELETE:
425
path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
427
path[:0] = [(self.DIFF_DELETE, text1[x])]
428
last_op = self.DIFF_DELETE
430
elif (x, y - 1) in v_map[d]:
432
if last_op == self.DIFF_INSERT:
433
path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
435
path[:0] = [(self.DIFF_INSERT, text2[y])]
436
last_op = self.DIFF_INSERT
441
assert text1[x] == text2[y], ("No diagonal. " +
442
"Can't happen. (diff_path1)")
443
if last_op == self.DIFF_EQUAL:
444
path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
446
path[:0] = [(self.DIFF_EQUAL, text1[x])]
447
last_op = self.DIFF_EQUAL
450
def diff_path2(self, v_map, text1, text2):
451
"""Work from the middle back to the end to determine the path.
454
v_map: Array of paths.
455
text1: Old string fragment to be diffed.
456
text2: New string fragment to be diffed.
459
Array of diff tuples.
465
for d in xrange(len(v_map) - 2, -1, -1):
467
if (x - 1, y) in v_map[d]:
469
if last_op == self.DIFF_DELETE:
470
path[-1] = (self.DIFF_DELETE, path[-1][1] + text1[-x - 1])
472
path.append((self.DIFF_DELETE, text1[-x - 1]))
473
last_op = self.DIFF_DELETE
475
elif (x, y - 1) in v_map[d]:
477
if last_op == self.DIFF_INSERT:
478
path[-1] = (self.DIFF_INSERT, path[-1][1] + text2[-y - 1])
480
path.append((self.DIFF_INSERT, text2[-y - 1]))
481
last_op = self.DIFF_INSERT
486
assert text1[-x - 1] == text2[-y - 1], ("No diagonal. " +
487
"Can't happen. (diff_path2)")
488
if last_op == self.DIFF_EQUAL:
489
path[-1] = (self.DIFF_EQUAL, path[-1][1] + text1[-x - 1])
491
path.append((self.DIFF_EQUAL, text1[-x - 1]))
492
last_op = self.DIFF_EQUAL
495
def diff_commonPrefix(self, text1, text2):
496
"""Determine the common prefix of two strings.
500
text2: Second string.
503
The number of characters common to the start of each string.
505
# Quick check for common null cases.
506
if not text1 or not text2 or text1[0] != text2[0]:
509
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
511
pointermax = min(len(text1), len(text2))
512
pointermid = pointermax
514
while pointermin < pointermid:
515
if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
516
pointermin = pointermid
517
pointerstart = pointermin
519
pointermax = pointermid
520
pointermid = int((pointermax - pointermin) / 2 + pointermin)
523
def diff_commonSuffix(self, text1, text2):
524
"""Determine the common suffix of two strings.
528
text2: Second string.
531
The number of characters common to the end of each string.
533
# Quick check for common null cases.
534
if not text1 or not text2 or text1[-1] != text2[-1]:
537
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
539
pointermax = min(len(text1), len(text2))
540
pointermid = pointermax
542
while pointermin < pointermid:
543
if (text1[-pointermid:len(text1) - pointerend] ==
544
text2[-pointermid:len(text2) - pointerend]):
545
pointermin = pointermid
546
pointerend = pointermin
548
pointermax = pointermid
549
pointermid = int((pointermax - pointermin) / 2 + pointermin)
552
def diff_halfMatch(self, text1, text2):
553
"""Do the two texts share a substring which is at least half the length of
558
text2: Second string.
561
Five element Array, containing the prefix of text1, the suffix of text1,
562
the prefix of text2, the suffix of text2 and the common middle. Or None
563
if there was no match.
565
if len(text1) > len(text2):
566
(longtext, shorttext) = (text1, text2)
568
(shorttext, longtext) = (text1, text2)
569
if len(longtext) < 10 or len(shorttext) < 1:
570
return None # Pointless.
572
def diff_halfMatchI(longtext, shorttext, i):
573
"""Does a substring of shorttext exist within longtext such that the
574
substring is at least half the length of longtext?
575
Closure, but does not reference any external variables.
578
longtext: Longer string.
579
shorttext: Shorter string.
580
i: Start index of quarter length substring within longtext.
583
Five element Array, containing the prefix of longtext, the suffix of
584
longtext, the prefix of shorttext, the suffix of shorttext and the
585
common middle. Or None if there was no match.
587
seed = longtext[i:i + len(longtext) / 4]
589
j = shorttext.find(seed)
591
prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
592
suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
593
if len(best_common) < suffixLength + prefixLength:
594
best_common = (shorttext[j - suffixLength:j] +
595
shorttext[j:j + prefixLength])
596
best_longtext_a = longtext[:i - suffixLength]
597
best_longtext_b = longtext[i + prefixLength:]
598
best_shorttext_a = shorttext[:j - suffixLength]
599
best_shorttext_b = shorttext[j + prefixLength:]
600
j = shorttext.find(seed, j + 1)
602
if len(best_common) >= len(longtext) / 2:
603
return (best_longtext_a, best_longtext_b,
604
best_shorttext_a, best_shorttext_b, best_common)
608
# First check if the second quarter is the seed for a half-match.
609
hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
610
# Check again based on the third quarter.
611
hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
612
if not hm1 and not hm2:
619
# Both matched. Select the longest.
620
if len(hm1[4]) > len(hm2[4]):
625
# A half-match was found, sort out the return data.
626
if len(text1) > len(text2):
627
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
629
(text2_a, text2_b, text1_a, text1_b, mid_common) = hm
630
return (text1_a, text1_b, text2_a, text2_b, mid_common)
632
def diff_cleanupSemantic(self, diffs):
633
"""Reduce the number of edits by eliminating semantically trivial
637
diffs: Array of diff tuples.
640
equalities = [] # Stack of indices where equalities are found.
641
lastequality = None # Always equal to equalities[-1][1]
642
pointer = 0 # Index of current position.
643
length_changes1 = 0 # Number of chars that changed prior to the equality.
644
length_changes2 = 0 # Number of chars that changed after the equality.
645
while pointer < len(diffs):
646
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
647
equalities.append(pointer)
648
length_changes1 = length_changes2
650
lastequality = diffs[pointer][1]
651
else: # an insertion or deletion
652
length_changes2 += len(diffs[pointer][1])
653
if (lastequality != None and (len(lastequality) <= length_changes1) and
654
(len(lastequality) <= length_changes2)):
656
diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
657
# Change second copy to insert.
658
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
659
diffs[equalities[-1] + 1][1])
660
# Throw away the equality we just deleted.
662
# Throw away the previous equality (it needs to be reevaluated).
663
if len(equalities) != 0:
666
pointer = equalities[-1]
669
length_changes1 = 0 # Reset the counters.
676
self.diff_cleanupMerge(diffs)
678
self.diff_cleanupSemanticLossless(diffs)
680
def diff_cleanupSemanticLossless(self, diffs):
681
"""Look for single edits surrounded on both sides by equalities
682
which can be shifted sideways to align the edit to a word boundary.
683
e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
686
diffs: Array of diff tuples.
689
def diff_cleanupSemanticScore(one, two):
690
"""Given two strings, compute a score representing whether the
691
internal boundary falls on logical boundaries.
692
Scores range from 5 (best) to 0 (worst).
693
Closure, but does not reference any external variables.
702
if not one or not two:
703
# Edges are the best.
706
# Each port of this function behaves slightly differently due to
707
# subtle differences in each language's definition of things like
708
# 'whitespace'. Since this function's purpose is largely cosmetic,
709
# the choice has been made to use each language's native features
710
# rather than force total conformity.
712
# One point for non-alphanumeric.
713
if not one[-1].isalnum() or not two[0].isalnum():
715
# Two points for whitespace.
716
if one[-1].isspace() or two[0].isspace():
718
# Three points for line breaks.
719
if (one[-1] == "\r" or one[-1] == "\n" or
720
two[0] == "\r" or two[0] == "\n"):
722
# Four points for blank lines.
723
if (re.search("\\n\\r?\\n$", one) or
724
re.match("^\\r?\\n\\r?\\n", two)):
729
# Intentionally ignore the first and last element (don't need checking).
730
while pointer < len(diffs) - 1:
731
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
732
diffs[pointer + 1][0] == self.DIFF_EQUAL):
733
# This is a single edit surrounded by equalities.
734
equality1 = diffs[pointer - 1][1]
735
edit = diffs[pointer][1]
736
equality2 = diffs[pointer + 1][1]
738
# First, shift the edit as far left as possible.
739
commonOffset = self.diff_commonSuffix(equality1, edit)
741
commonString = edit[-commonOffset:]
742
equality1 = equality1[:-commonOffset]
743
edit = commonString + edit[:-commonOffset]
744
equality2 = commonString + equality2
746
# Second, step character by character right, looking for the best fit.
747
bestEquality1 = equality1
749
bestEquality2 = equality2
750
bestScore = (diff_cleanupSemanticScore(equality1, edit) +
751
diff_cleanupSemanticScore(edit, equality2))
752
while edit and equality2 and edit[0] == equality2[0]:
754
edit = edit[1:] + equality2[0]
755
equality2 = equality2[1:]
756
score = (diff_cleanupSemanticScore(equality1, edit) +
757
diff_cleanupSemanticScore(edit, equality2))
758
# The >= encourages trailing rather than leading whitespace on edits.
759
if score >= bestScore:
761
bestEquality1 = equality1
763
bestEquality2 = equality2
765
if diffs[pointer - 1][1] != bestEquality1:
766
# We have an improvement, save it back to the diff.
768
diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
770
del diffs[pointer - 1]
772
diffs[pointer] = (diffs[pointer][0], bestEdit)
774
diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
776
del diffs[pointer + 1]
780
def diff_cleanupEfficiency(self, diffs):
781
"""Reduce the number of edits by eliminating operationally trivial
785
diffs: Array of diff tuples.
788
equalities = [] # Stack of indices where equalities are found.
789
lastequality = '' # Always equal to equalities[-1][1]
790
pointer = 0 # Index of current position.
791
pre_ins = False # Is there an insertion operation before the last equality.
792
pre_del = False # Is there a deletion operation before the last equality.
793
post_ins = False # Is there an insertion operation after the last equality.
794
post_del = False # Is there a deletion operation after the last equality.
795
while pointer < len(diffs):
796
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
797
if (len(diffs[pointer][1]) < self.Diff_EditCost and
798
(post_ins or post_del)):
800
equalities.append(pointer)
803
lastequality = diffs[pointer][1]
805
# Not a candidate, and can never become one.
809
post_ins = post_del = False
810
else: # an insertion or deletion
811
if diffs[pointer][0] == self.DIFF_DELETE:
816
# Five types to be split:
817
# <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
818
# <ins>A</ins>X<ins>C</ins><del>D</del>
819
# <ins>A</ins><del>B</del>X<ins>C</ins>
820
# <ins>A</del>X<ins>C</ins><del>D</del>
821
# <ins>A</ins><del>B</del>X<del>C</del>
823
if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
824
((len(lastequality) < self.Diff_EditCost / 2) and
825
(pre_ins + pre_del + post_ins + post_del) == 3)):
827
diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
828
# Change second copy to insert.
829
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
830
diffs[equalities[-1] + 1][1])
831
equalities.pop() # Throw away the equality we just deleted
833
if pre_ins and pre_del:
834
# No changes made which could affect previous entry, keep going.
835
post_ins = post_del = True
839
equalities.pop() # Throw away the previous equality
841
pointer = equalities[-1]
844
post_ins = post_del = False
849
self.diff_cleanupMerge(diffs)
851
def diff_cleanupMerge(self, diffs):
852
"""Reorder and merge like edit sections. Merge equalities.
853
Any edit section can move as long as it doesn't cross an equality.
856
diffs: Array of diff tuples.
858
diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.
864
while pointer < len(diffs):
865
if diffs[pointer][0] == self.DIFF_INSERT:
867
text_insert += diffs[pointer][1]
869
elif diffs[pointer][0] == self.DIFF_DELETE:
871
text_delete += diffs[pointer][1]
873
elif diffs[pointer][0] == self.DIFF_EQUAL:
874
# Upon reaching an equality, check for prior redundancies.
875
if count_delete != 0 or count_insert != 0:
876
if count_delete != 0 and count_insert != 0:
877
# Factor out any common prefixies.
878
commonlength = self.diff_commonPrefix(text_insert, text_delete)
879
if commonlength != 0:
880
x = pointer - count_delete - count_insert - 1
881
if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
882
diffs[x] = (diffs[x][0], diffs[x][1] +
883
text_insert[:commonlength])
885
diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
887
text_insert = text_insert[commonlength:]
888
text_delete = text_delete[commonlength:]
889
# Factor out any common suffixies.
890
commonlength = self.diff_commonSuffix(text_insert, text_delete)
891
if commonlength != 0:
892
diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
894
text_insert = text_insert[:-commonlength]
895
text_delete = text_delete[:-commonlength]
896
# Delete the offending records and add the merged ones.
897
if count_delete == 0:
898
diffs[pointer - count_insert : pointer] = [
899
(self.DIFF_INSERT, text_insert)]
900
elif count_insert == 0:
901
diffs[pointer - count_delete : pointer] = [
902
(self.DIFF_DELETE, text_delete)]
904
diffs[pointer - count_delete - count_insert : pointer] = [
905
(self.DIFF_DELETE, text_delete),
906
(self.DIFF_INSERT, text_insert)]
907
pointer = pointer - count_delete - count_insert + 1
908
if count_delete != 0:
910
if count_insert != 0:
912
elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
913
# Merge this equality with the previous one.
914
diffs[pointer - 1] = (diffs[pointer - 1][0],
915
diffs[pointer - 1][1] + diffs[pointer][1])
925
if diffs[-1][1] == '':
926
diffs.pop() # Remove the dummy entry at the end.
928
# Second pass: look for single edits surrounded on both sides by equalities
929
# which can be shifted sideways to eliminate an equality.
930
# e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
933
# Intentionally ignore the first and last element (don't need checking).
934
while pointer < len(diffs) - 1:
935
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
936
diffs[pointer + 1][0] == self.DIFF_EQUAL):
937
# This is a single edit surrounded by equalities.
938
if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
939
# Shift the edit over the previous equality.
940
diffs[pointer] = (diffs[pointer][0],
941
diffs[pointer - 1][1] +
942
diffs[pointer][1][:-len(diffs[pointer - 1][1])])
943
diffs[pointer + 1] = (diffs[pointer + 1][0],
944
diffs[pointer - 1][1] + diffs[pointer + 1][1])
945
del diffs[pointer - 1]
947
elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
948
# Shift the edit over the next equality.
949
diffs[pointer - 1] = (diffs[pointer - 1][0],
950
diffs[pointer - 1][1] + diffs[pointer + 1][1])
951
diffs[pointer] = (diffs[pointer][0],
952
diffs[pointer][1][len(diffs[pointer + 1][1]):] +
953
diffs[pointer + 1][1])
954
del diffs[pointer + 1]
958
# If shifts were made, the diff needs reordering and another shift sweep.
960
self.diff_cleanupMerge(diffs)
962
def diff_xIndex(self, diffs, loc):
963
"""loc is a location in text1, compute and return the equivalent location
964
in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
967
diffs: Array of diff tuples.
968
loc: Location within text1.
971
Location within text2.
977
for x in xrange(len(diffs)):
978
(op, text) = diffs[x]
979
if op != self.DIFF_INSERT: # Equality or deletion.
981
if op != self.DIFF_DELETE: # Equality or insertion.
983
if chars1 > loc: # Overshot the location.
988
if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
989
# The location was deleted.
991
# Add the remaining len(character).
992
return last_chars2 + (loc - last_chars1)
994
def diff_prettyHtml(self, diffs):
995
"""Convert a diff array into a pretty HTML report.
998
diffs: Array of diff tuples.
1001
HTML representation.
1005
for (op, data) in diffs:
1006
text = (data.replace("&", "&").replace("<", "<")
1007
.replace(">", ">").replace("\n", "¶<BR>"))
1008
if op == self.DIFF_INSERT:
1009
html.append("<INS STYLE=\"background:#E6FFE6;\" TITLE=\"i=%i\">%s</INS>"
1011
elif op == self.DIFF_DELETE:
1012
html.append("<DEL STYLE=\"background:#FFE6E6;\" TITLE=\"i=%i\">%s</DEL>"
1014
elif op == self.DIFF_EQUAL:
1015
html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text))
1016
if op != self.DIFF_DELETE:
1018
return "".join(html)
1020
def diff_text1(self, diffs):
1021
"""Compute and return the source text (all equalities and deletions).
1024
diffs: Array of diff tuples.
1030
for (op, data) in diffs:
1031
if op != self.DIFF_INSERT:
1033
return "".join(text)
1035
def diff_text2(self, diffs):
1036
"""Compute and return the destination text (all equalities and insertions).
1039
diffs: Array of diff tuples.
1045
for (op, data) in diffs:
1046
if op != self.DIFF_DELETE:
1048
return "".join(text)
1050
def diff_levenshtein(self, diffs):
1051
"""Compute the Levenshtein distance; the number of inserted, deleted or
1052
substituted characters.
1055
diffs: Array of diff tuples.
1063
for (op, data) in diffs:
1064
if op == self.DIFF_INSERT:
1065
insertions += len(data)
1066
elif op == self.DIFF_DELETE:
1067
deletions += len(data)
1068
elif op == self.DIFF_EQUAL:
1069
# A deletion and an insertion is one substitution.
1070
levenshtein += max(insertions, deletions)
1073
levenshtein += max(insertions, deletions)
1076
def diff_toDelta(self, diffs):
1077
"""Crush the diff into an encoded string which describes the operations
1078
required to transform text1 into text2.
1079
E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1080
Operations are tab-separated. Inserted text is escaped using %xx notation.
1083
diffs: Array of diff tuples.
1089
for (op, data) in diffs:
1090
if op == self.DIFF_INSERT:
1091
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
1092
data = data.encode("utf-8")
1093
text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
1094
elif op == self.DIFF_DELETE:
1095
text.append("-%d" % len(data))
1096
elif op == self.DIFF_EQUAL:
1097
text.append("=%d" % len(data))
1098
return "\t".join(text)
1100
def diff_fromDelta(self, text1, delta):
1101
"""Given the original text1, and an encoded string which describes the
1102
operations required to transform text1 into text2, compute the full diff.
1105
text1: Source string for the diff.
1109
Array of diff tuples.
1112
ValueError: If invalid input.
1114
if type(delta) == unicode:
1115
# Deltas should be composed of a subset of ascii chars, Unicode not
1116
# required. If this encode raises UnicodeEncodeError, delta is invalid.
1117
delta = delta.encode("ascii")
1119
pointer = 0 # Cursor in text1
1120
tokens = delta.split("\t")
1121
for token in tokens:
1123
# Blank tokens are ok (from a trailing \t).
1125
# Each token begins with a one character parameter which specifies the
1126
# operation of this token (delete, insert, equality).
1129
param = urllib.unquote(param).decode("utf-8")
1130
diffs.append((self.DIFF_INSERT, param))
1131
elif token[0] == "-" or token[0] == "=":
1135
raise ValueError, "Invalid number in diff_fromDelta: " + param
1137
raise ValueError, "Negative number in diff_fromDelta: " + param
1138
text = text1[pointer : pointer + n]
1141
diffs.append((self.DIFF_EQUAL, text))
1143
diffs.append((self.DIFF_DELETE, text))
1145
# Anything else is an error.
1146
raise ValueError, ("Invalid diff operation in diff_fromDelta: " +
1148
if pointer != len(text1):
1150
"Delta length (%d) does not equal source text length (%d)." %
1151
(pointer, len(text1)))
1156
def match_main(self, text, pattern, loc):
1157
"""Locate the best instance of 'pattern' in 'text' near 'loc'.
1160
text: The text to search.
1161
pattern: The pattern to search for.
1162
loc: The location to search around.
1165
Best match index or -1.
1167
loc = max(0, min(loc, len(text)))
1169
# Shortcut (potentially not guaranteed by the algorithm)
1174
elif text[loc:loc + len(pattern)] == pattern:
1175
# Perfect match at the perfect spot! (Includes case of null pattern)
1178
# Do a fuzzy compare.
1179
match = self.match_bitap(text, pattern, loc)
1182
def match_bitap(self, text, pattern, loc):
  """Locate the best instance of 'pattern' in 'text' near 'loc' using the
  Bitap algorithm.

  Args:
    text: The text to search.
    pattern: The pattern to search for.
    loc: The location to search around.

  Returns:
    Best match index or -1.
  """
  # Python doesn't have a maxint limit, so ignore this check.
  #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
  #  raise ValueError("Pattern too long for this application.")

  # Initialise the alphabet.
  s = self.match_alphabet(pattern)

  def match_bitapScore(e, x):
    """Compute and return the score for a match with e errors and x location.
    Accesses loc and pattern through being a closure.

    Args:
      e: Number of errors in match.
      x: Location of match.

    Returns:
      Overall score for match (0.0 = good, 1.0 = bad).
    """
    accuracy = float(e) / len(pattern)
    proximity = abs(loc - x)
    if not self.Match_Distance:
      # Dodge divide by zero error.
      return proximity and 1.0 or accuracy
    return accuracy + (proximity / float(self.Match_Distance))

  # Highest score beyond which we give up.
  score_threshold = self.Match_Threshold
  # Is there a nearby exact match? (speedup)
  best_loc = text.find(pattern, loc)
  if best_loc != -1:
    score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
  # What about in the other direction? (speedup)
  best_loc = text.rfind(pattern, loc + len(pattern))
  if best_loc != -1:
    score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

  # Initialise the bit arrays.
  matchmask = 1 << (len(pattern) - 1)
  best_loc = -1

  bin_max = len(pattern) + len(text)
  # Empty initialization added to appease pychecker.
  last_rd = None
  for d in xrange(len(pattern)):
    # Scan for the best match each iteration allows for one more error.
    # Run a binary search to determine how far from 'loc' we can stray at
    # this error level.
    bin_min = 0
    bin_mid = bin_max
    while bin_min < bin_mid:
      if match_bitapScore(d, loc + bin_mid) <= score_threshold:
        bin_min = bin_mid
      else:
        bin_max = bin_mid
      bin_mid = (bin_max - bin_min) / 2 + bin_min
    # Use the result from this iteration as the maximum for the next.
    bin_max = bin_mid
    start = max(1, loc - bin_mid + 1)
    finish = min(loc + bin_mid, len(text)) + len(pattern)

    rd = range(finish + 1)
    rd.append((1 << d) - 1)
    for j in xrange(finish, start - 1, -1):
      if len(text) <= j - 1:
        # Out of range.
        charMatch = 0
      else:
        charMatch = s.get(text[j - 1], 0)
      if d == 0:  # First pass: exact match.
        rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
      else:  # Subsequent passes: fuzzy match.
        rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
            ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
      if rd[j] & matchmask:
        score = match_bitapScore(d, j - 1)
        # This match will almost certainly be better than any existing match.
        # But check anyway.
        if score <= score_threshold:
          # Told you so.
          score_threshold = score
          best_loc = j - 1
          if best_loc > loc:
            # When passing loc, don't exceed our current distance from loc.
            start = max(1, 2 * loc - best_loc)
          else:
            # Already passed loc, downhill from here on in.
            break
    # No hope for a (better) match at greater error levels.
    if match_bitapScore(d + 1, loc) > score_threshold:
      break
    last_rd = rd
  return best_loc
1288
def match_alphabet(self, pattern):
  """Initialise the alphabet for the Bitap algorithm.

  Each character maps to a bitmask with a 1 bit at every position where
  that character occurs in the pattern (MSB = first pattern char).

  Args:
    pattern: The text to encode.

  Returns:
    Hash of character locations.
  """
  s = {}
  for char in pattern:
    s[char] = 0
  for i in xrange(len(pattern)):
    s[pattern[i]] |= 1 << (len(pattern) - i - 1)
  return s
1306
def patch_addContext(self, patch, text):
  """Increase the context until it is unique,
  but don't let the pattern expand beyond Match_MaxBits.

  Args:
    patch: The patch to grow.
    text: Source text.
  """
  if len(text) == 0:
    # Nothing to draw context from.
    return
  pattern = text[patch.start2 : patch.start2 + patch.length1]
  padding = 0

  # Look for the first and last matches of pattern in text.  If two different
  # matches are found, increase the pattern length.
  while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
      0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
      self.Patch_Margin)):
    padding += self.Patch_Margin
    pattern = text[max(0, patch.start2 - padding) :
                   patch.start2 + patch.length1 + padding]
  # Add one chunk for good luck.
  padding += self.Patch_Margin

  # Add the prefix.
  prefix = text[max(0, patch.start2 - padding) : patch.start2]
  if prefix:
    patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
  # Add the suffix.
  suffix = text[patch.start2 + patch.length1 :
                patch.start2 + patch.length1 + padding]
  if suffix:
    patch.diffs.append((self.DIFF_EQUAL, suffix))

  # Roll back the start points.
  patch.start1 -= len(prefix)
  patch.start2 -= len(prefix)
  # Extend lengths.
  patch.length1 += len(prefix) + len(suffix)
  patch.length2 += len(prefix) + len(suffix)
1347
def patch_make(self, a, b=None, c=None):
  """Compute a list of patches to turn text1 into text2.
  Use diffs if provided, otherwise compute it ourselves.
  There are four ways to call this function, depending on what data is
  available to the caller:
  Method 1:
  a = text1, b = text2
  Method 2:
  a = diffs
  Method 3 (optimal):
  a = text1, b = diffs
  Method 4 (deprecated, use method 3):
  a = text1, b = text2, c = diffs

  Args:
    a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
        text2 (method 2).
    b: text2 (methods 1,4) or Array of diff tuples for text1 to
        text2 (method 3) or undefined (method 2).
    c: Array of diff tuples for text1 to text2 (method 4) or
        undefined (methods 1,2,3).

  Returns:
    Array of patch objects.

  Raises:
    ValueError: If the arguments do not match any of the four call formats.
  """
  text1 = None
  diffs = None
  # Note that texts may arrive as 'str' or 'unicode'.
  if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
    # Method 1: text1, text2
    # Compute diffs from text1 and text2.
    text1 = a
    diffs = self.diff_main(text1, b, True)
    if len(diffs) > 2:
      self.diff_cleanupSemantic(diffs)
      self.diff_cleanupEfficiency(diffs)
  elif isinstance(a, list) and b is None and c is None:
    # Method 2: diffs
    # Compute text1 from diffs.
    diffs = a
    text1 = self.diff_text1(diffs)
  elif isinstance(a, basestring) and isinstance(b, list) and c is None:
    # Method 3: text1, diffs
    text1 = a
    diffs = b
  elif (isinstance(a, basestring) and isinstance(b, basestring) and
        isinstance(c, list)):
    # Method 4: text1, text2, diffs
    # text2 is not used.
    text1 = a
    diffs = c
  else:
    raise ValueError("Unknown call format to patch_make.")

  if not diffs:
    return []  # Get rid of the None case.
  patches = []
  patch = patch_obj()
  char_count1 = 0  # Number of characters into the text1 string.
  char_count2 = 0  # Number of characters into the text2 string.
  prepatch_text = text1  # Recreate the patches to determine context info.
  postpatch_text = text1
  for x in xrange(len(diffs)):
    (diff_type, diff_text) = diffs[x]
    if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
      # A new patch starts here.
      patch.start1 = char_count1
      patch.start2 = char_count2
    if diff_type == self.DIFF_INSERT:
      # Insertion.
      patch.diffs.append(diffs[x])
      patch.length2 += len(diff_text)
      postpatch_text = (postpatch_text[:char_count2] + diff_text +
                        postpatch_text[char_count2:])
    elif diff_type == self.DIFF_DELETE:
      # Deletion.
      patch.length1 += len(diff_text)
      patch.diffs.append(diffs[x])
      postpatch_text = (postpatch_text[:char_count2] +
                        postpatch_text[char_count2 + len(diff_text):])
    elif (diff_type == self.DIFF_EQUAL and
          len(diff_text) <= 2 * self.Patch_Margin and
          len(patch.diffs) != 0 and len(diffs) != x + 1):
      # Small equality inside a patch.
      patch.diffs.append(diffs[x])
      patch.length1 += len(diff_text)
      patch.length2 += len(diff_text)

    if (diff_type == self.DIFF_EQUAL and
        len(diff_text) >= 2 * self.Patch_Margin):
      # Time for a new patch.
      if len(patch.diffs) != 0:
        self.patch_addContext(patch, prepatch_text)
        patches.append(patch)
        patch = patch_obj()
        # Unlike Unidiff, our patch lists have a rolling context.
        # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
        # Update prepatch text & pos to reflect the application of the
        # just completed patch.
        prepatch_text = postpatch_text
        char_count1 = char_count2

    # Update the current character count.
    if diff_type != self.DIFF_INSERT:
      char_count1 += len(diff_text)
    if diff_type != self.DIFF_DELETE:
      char_count2 += len(diff_text)

  # Pick up the leftover patch if not empty.
  if len(patch.diffs) != 0:
    self.patch_addContext(patch, prepatch_text)
    patches.append(patch)

  return patches
1461
def patch_deepCopy(self, patches):
  """Given an array of patches, return another array that is identical.

  Args:
    patches: Array of patch objects.

  Returns:
    Array of patch objects.
  """
  patchesCopy = []
  for patch in patches:
    patchCopy = patch_obj()
    # No need to deep copy the tuples since they are immutable.
    patchCopy.diffs = patch.diffs[:]
    patchCopy.start1 = patch.start1
    patchCopy.start2 = patch.start2
    patchCopy.length1 = patch.length1
    patchCopy.length2 = patch.length2
    patchesCopy.append(patchCopy)
  return patchesCopy
1482
def patch_apply(self, patches, text):
  """Merge a set of patches onto the text.  Return a patched text, as well
  as a list of true/false values indicating which patches were applied.

  Args:
    patches: Array of patch objects.
    text: Old text.

  Returns:
    Two element Array, containing the new text and an array of boolean values.
  """
  if not patches:
    return (text, [])

  # Deep copy the patches so that no changes are made to originals.
  patches = self.patch_deepCopy(patches)

  nullPadding = self.patch_addPadding(patches)
  text = nullPadding + text + nullPadding
  self.patch_splitMax(patches)

  # delta keeps track of the offset between the expected and actual location
  # of the previous patch.  If there are patches expected at positions 10 and
  # 20, but the first patch was found at 12, delta is 2 and the second patch
  # has an effective expected position of 22.
  delta = 0
  results = []
  for patch in patches:
    expected_loc = patch.start2 + delta
    text1 = self.diff_text1(patch.diffs)
    end_loc = -1
    if len(text1) > self.Match_MaxBits:
      # patch_splitMax will only provide an oversized pattern in the case of
      # a monster delete.
      start_loc = self.match_main(text, text1[:self.Match_MaxBits],
                                  expected_loc)
      if start_loc != -1:
        end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
            expected_loc + len(text1) - self.Match_MaxBits)
        if end_loc == -1 or start_loc >= end_loc:
          # Can't find valid trailing context.  Drop this patch.
          start_loc = -1
    else:
      start_loc = self.match_main(text, text1, expected_loc)
    if start_loc == -1:
      # No match found.  :(
      results.append(False)
      # Subtract the delta for this failed patch from subsequent patches.
      delta -= patch.length2 - patch.length1
    else:
      # Found a match.  :)
      results.append(True)
      delta = start_loc - expected_loc
      if end_loc == -1:
        text2 = text[start_loc : start_loc + len(text1)]
      else:
        text2 = text[start_loc : end_loc + self.Match_MaxBits]
      if text1 == text2:
        # Perfect match, just shove the replacement text in.
        text = (text[:start_loc] + self.diff_text2(patch.diffs) +
                text[start_loc + len(text1):])
      else:
        # Imperfect match.
        # Run a diff to get a framework of equivalent indices.
        diffs = self.diff_main(text1, text2, False)
        if (len(text1) > self.Match_MaxBits and
            self.diff_levenshtein(diffs) / float(len(text1)) >
            self.Patch_DeleteThreshold):
          # The end points match, but the content is unacceptably bad.
          results[-1] = False
        else:
          self.diff_cleanupSemanticLossless(diffs)
          index1 = 0
          for (op, data) in patch.diffs:
            if op != self.DIFF_EQUAL:
              index2 = self.diff_xIndex(diffs, index1)
            if op == self.DIFF_INSERT:  # Insertion
              text = text[:start_loc + index2] + data + text[start_loc +
                                                             index2:]
            elif op == self.DIFF_DELETE:  # Deletion
              text = text[:start_loc + index2] + text[start_loc +
                  self.diff_xIndex(diffs, index1 + len(data)):]
            if op != self.DIFF_DELETE:
              index1 += len(data)
  # Strip the padding off.
  text = text[len(nullPadding):-len(nullPadding)]
  return (text, results)
1570
def patch_addPadding(self, patches):
  """Add some padding on text start and end so that edges can match
  something.  Intended to be called only from within patch_apply.

  Args:
    patches: Array of patch objects.

  Returns:
    The padding string added to each side.
  """
  paddingLength = self.Patch_Margin
  nullPadding = ""
  # Padding is the control characters \x01..\x04, which cannot collide
  # with real text.
  for x in xrange(1, paddingLength + 1):
    nullPadding += chr(x)

  # Bump all the patches forward.
  for patch in patches:
    patch.start1 += paddingLength
    patch.start2 += paddingLength

  # Add some padding on start of first diff.
  patch = patches[0]
  diffs = patch.diffs
  if not diffs or diffs[0][0] != self.DIFF_EQUAL:
    # Add nullPadding equality.
    diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
    patch.start1 -= paddingLength  # Should be 0.
    patch.start2 -= paddingLength  # Should be 0.
    patch.length1 += paddingLength
    patch.length2 += paddingLength
  elif paddingLength > len(diffs[0][1]):
    # Grow first equality.
    extraLength = paddingLength - len(diffs[0][1])
    newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
    diffs[0] = (diffs[0][0], newText)
    patch.start1 -= extraLength
    patch.start2 -= extraLength
    patch.length1 += extraLength
    patch.length2 += extraLength

  # Add some padding on end of last diff.
  patch = patches[-1]
  diffs = patch.diffs
  if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
    # Add nullPadding equality.
    diffs.append((self.DIFF_EQUAL, nullPadding))
    patch.length1 += paddingLength
    patch.length2 += paddingLength
  elif paddingLength > len(diffs[-1][1]):
    # Grow last equality.
    extraLength = paddingLength - len(diffs[-1][1])
    newText = diffs[-1][1] + nullPadding[:extraLength]
    diffs[-1] = (diffs[-1][0], newText)
    patch.length1 += extraLength
    patch.length2 += extraLength

  return nullPadding
1628
def patch_splitMax(self, patches):
  """Look through the patches and break up any which are longer than the
  maximum limit of the match algorithm.

  Args:
    patches: Array of patch objects.
  """
  if self.Match_MaxBits == 0:
    # Python has arbitrary-precision ints, so no splitting is required.
    return
  for x in xrange(len(patches)):
    if patches[x].length1 > self.Match_MaxBits:
      bigpatch = patches[x]
      # Remove the big old patch.
      del patches[x]
      x -= 1
      patch_size = self.Match_MaxBits
      start1 = bigpatch.start1
      start2 = bigpatch.start2
      precontext = ''
      while len(bigpatch.diffs) != 0:
        # Create one of several smaller patches.
        patch = patch_obj()
        empty = True
        patch.start1 = start1 - len(precontext)
        patch.start2 = start2 - len(precontext)
        if precontext:
          patch.length1 = patch.length2 = len(precontext)
          patch.diffs.append((self.DIFF_EQUAL, precontext))

        while (len(bigpatch.diffs) != 0 and
               patch.length1 < patch_size - self.Patch_Margin):
          (diff_type, diff_text) = bigpatch.diffs[0]
          if diff_type == self.DIFF_INSERT:
            # Insertions are harmless.
            patch.length2 += len(diff_text)
            start2 += len(diff_text)
            patch.diffs.append(bigpatch.diffs.pop(0))
            empty = False
          elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
              patch.diffs[0][0] == self.DIFF_EQUAL and
              len(diff_text) > 2 * patch_size):
            # This is a large deletion.  Let it pass in one chunk.
            patch.length1 += len(diff_text)
            start1 += len(diff_text)
            empty = False
            patch.diffs.append((diff_type, diff_text))
            del bigpatch.diffs[0]
          else:
            # Deletion or equality.  Only take as much as we can stomach.
            diff_text = diff_text[:patch_size - patch.length1 -
                                  self.Patch_Margin]
            patch.length1 += len(diff_text)
            start1 += len(diff_text)
            if diff_type == self.DIFF_EQUAL:
              patch.length2 += len(diff_text)
              start2 += len(diff_text)
            else:
              empty = False

            patch.diffs.append((diff_type, diff_text))
            if diff_text == bigpatch.diffs[0][1]:
              del bigpatch.diffs[0]
            else:
              bigpatch.diffs[0] = (bigpatch.diffs[0][0],
                                   bigpatch.diffs[0][1][len(diff_text):])

        # Compute the head context for the next patch.
        precontext = self.diff_text2(patch.diffs)
        precontext = precontext[-self.Patch_Margin:]
        # Append the end context for this patch.
        postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
        if postcontext:
          patch.length1 += len(postcontext)
          patch.length2 += len(postcontext)
          if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
            patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
                               postcontext)
          else:
            patch.diffs.append((self.DIFF_EQUAL, postcontext))

        if not empty:
          x += 1
          patches.insert(x, patch)
1712
def patch_toText(self, patches):
  """Take a list of patches and return a textual representation.

  Args:
    patches: Array of patch objects.

  Returns:
    Text representation of patches.
  """
  text = []
  for patch in patches:
    text.append(str(patch))
  return "".join(text)
1726
def patch_fromText(self, textline):
1727
"""Parse a textual representation of patches and return a list of patch
1731
textline: Text representation of patches.
1734
Array of patch objects.
1737
ValueError: If invalid input.
1739
if type(textline) == unicode:
1740
# Patches should be composed of a subset of ascii chars, Unicode not
1741
# required. If this encode raises UnicodeEncodeError, patch is invalid.
1742
textline = textline.encode("ascii")
1746
text = textline.split('\n')
1747
while len(text) != 0:
1748
m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1750
raise ValueError, "Invalid patch string: " + text[0]
1752
patches.append(patch)
1753
patch.start1 = int(m.group(1))
1754
if m.group(2) == '':
1757
elif m.group(2) == '0':
1761
patch.length1 = int(m.group(2))
1763
patch.start2 = int(m.group(3))
1764
if m.group(4) == '':
1767
elif m.group(4) == '0':
1771
patch.length2 = int(m.group(4))
1775
while len(text) != 0:
1780
line = urllib.unquote(text[0][1:])
1781
line = line.decode("utf-8")
1784
patch.diffs.append((self.DIFF_INSERT, line))
1787
patch.diffs.append((self.DIFF_DELETE, line))
1790
patch.diffs.append((self.DIFF_EQUAL, line))
1792
# Start of next patch.
1795
# Blank line? Whatever.
1799
raise ValueError, "Invalid patch mode: '%s'\n%s" % (sign, line)
class patch_obj:
  """Class representing one patch operation.
  """

  def __init__(self):
    """Initializes with an empty list of diffs.
    """
    self.diffs = []
    self.start1 = None
    self.start2 = None
    self.length1 = 0
    self.length2 = 0

  def __str__(self):
    """Emmulate GNU diff's format.
    Header: @@ -382,8 +481,9 @@
    Indicies are printed as 1-based, not 0-based.

    Returns:
      The GNU diff string.
    """
    if self.length1 == 0:
      coords1 = str(self.start1) + ",0"
    elif self.length1 == 1:
      coords1 = str(self.start1 + 1)
    else:
      coords1 = str(self.start1 + 1) + "," + str(self.length1)
    if self.length2 == 0:
      coords2 = str(self.start2) + ",0"
    elif self.length2 == 1:
      coords2 = str(self.start2 + 1)
    else:
      coords2 = str(self.start2 + 1) + "," + str(self.length2)
    text = ["@@ -", coords1, " +", coords2, " @@\n"]
    # Escape the body of the patch with %xx notation.
    for (op, data) in self.diffs:
      if op == diff_match_patch.DIFF_INSERT:
        text.append("+")
      elif op == diff_match_patch.DIFF_DELETE:
        text.append("-")
      elif op == diff_match_patch.DIFF_EQUAL:
        text.append(" ")
      # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
      data = data.encode("utf-8")
      text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
    return "".join(text)