36
38
class diff_match_patch:
37
"""Class containing the diff, match and patch methods.
39
Also contains the behaviour settings.
43
"""Inits a diff_match_patch object with default settings.
44
Redefine these in your program to override the defaults.
47
# Number of seconds to map a diff before giving up (0 for infinity).
48
self.Diff_Timeout = 1.0
49
# Cost of an empty edit operation in terms of edit characters.
50
self.Diff_EditCost = 4
51
# The size beyond which the double-ended diff activates.
52
# Double-ending is twice as fast, but less accurate.
53
self.Diff_DualThreshold = 32
54
# At what point is no match declared (0.0 = perfection, 1.0 = very loose).
55
self.Match_Threshold = 0.5
56
# How far to search for a match (0 = exact location, 1000+ = broad match).
57
# A match this many characters away from the expected location will add
58
# 1.0 to the score (0.0 is a perfect match).
59
self.Match_Distance = 1000
60
# When deleting a large block of text (over ~64 characters), how close does
61
# the contents have to match the expected contents. (0.0 = perfection,
62
# 1.0 = very loose). Note that Match_Threshold controls how closely the
63
# end points of a delete need to match.
64
self.Patch_DeleteThreshold = 0.5
65
# Chunk size for context length.
68
# How many bits in a number?
69
# Python has no maximum, thus to disable patch splitting set to 0.
70
# However to avoid long patches in certain pathological cases, use 32.
71
# Multiple short patches (using native ints) are much faster than long ones.
72
self.Match_MaxBits = 32
76
# The data structure representing a diff is an array of tuples:
77
# [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
78
# which means: delete "Hello", add "Goodbye" and keep " world."
83
def diff_main(self, text1, text2, checklines=True):
84
"""Find the differences between two texts. Simplifies the problem by
85
stripping any common prefix or suffix off the texts before diffing.
88
text1: Old string to be diffed.
89
text2: New string to be diffed.
90
checklines: Optional speedup flag. If present and false, then don't run
91
a line-level diff first to identify the changed areas.
92
Defaults to true, which does a faster, slightly less optimal diff.
98
# Check for equality (speedup)
100
return [(self.DIFF_EQUAL, text1)]
102
# Trim off common prefix (speedup)
103
commonlength = self.diff_commonPrefix(text1, text2)
104
commonprefix = text1[:commonlength]
105
text1 = text1[commonlength:]
106
text2 = text2[commonlength:]
108
# Trim off common suffix (speedup)
109
commonlength = self.diff_commonSuffix(text1, text2)
110
if commonlength == 0:
113
commonsuffix = text1[-commonlength:]
114
text1 = text1[:-commonlength]
115
text2 = text2[:-commonlength]
117
# Compute the diff on the middle block
118
diffs = self.diff_compute(text1, text2, checklines)
120
# Restore the prefix and suffix
122
diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
124
diffs.append((self.DIFF_EQUAL, commonsuffix))
125
self.diff_cleanupMerge(diffs)
128
def diff_compute(self, text1, text2, checklines):
129
"""Find the differences between two texts. Assumes that the texts do not
130
have any common prefix or suffix.
133
text1: Old string to be diffed.
134
text2: New string to be diffed.
135
checklines: Speedup flag. If false, then don't run a line-level diff
136
first to identify the changed areas.
137
If true, then run a faster, slightly less optimal diff.
143
# Just add some text (speedup)
144
return [(self.DIFF_INSERT, text2)]
147
# Just delete some text (speedup)
148
return [(self.DIFF_DELETE, text1)]
150
if len(text1) > len(text2):
151
(longtext, shorttext) = (text1, text2)
153
(shorttext, longtext) = (text1, text2)
154
i = longtext.find(shorttext)
156
# Shorter text is inside the longer text (speedup)
157
diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
158
(self.DIFF_INSERT, longtext[i + len(shorttext):])]
159
# Swap insertions for deletions if diff is reversed.
160
if len(text1) > len(text2):
161
diffs[0] = (self.DIFF_DELETE, diffs[0][1])
162
diffs[2] = (self.DIFF_DELETE, diffs[2][1])
164
longtext = shorttext = None # Garbage collect.
166
# Check to see if the problem can be split in two.
167
hm = self.diff_halfMatch(text1, text2)
169
# A half-match was found, sort out the return data.
170
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
171
# Send both pairs off for separate processing.
172
diffs_a = self.diff_main(text1_a, text2_a, checklines)
173
diffs_b = self.diff_main(text1_b, text2_b, checklines)
175
return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
177
# Perform a real diff.
178
if checklines and (len(text1) < 100 or len(text2) < 100):
179
checklines = False # Too trivial for the overhead.
181
# Scan the text on a line-by-line basis first.
182
(text1, text2, linearray) = self.diff_linesToChars(text1, text2)
184
diffs = self.diff_map(text1, text2)
185
if not diffs: # No acceptable result.
186
diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
188
# Convert the diff back to original text.
189
self.diff_charsToLines(diffs, linearray)
190
# Eliminate freak matches (e.g. blank lines)
191
self.diff_cleanupSemantic(diffs)
193
# Rediff any replacement blocks, this time character-by-character.
194
# Add a dummy entry at the end.
195
diffs.append((self.DIFF_EQUAL, ''))
201
while pointer < len(diffs):
202
if diffs[pointer][0] == self.DIFF_INSERT:
204
text_insert += diffs[pointer][1]
205
elif diffs[pointer][0] == self.DIFF_DELETE:
207
text_delete += diffs[pointer][1]
208
elif diffs[pointer][0] == self.DIFF_EQUAL:
209
# Upon reaching an equality, check for prior redundancies.
210
if count_delete >= 1 and count_insert >= 1:
211
# Delete the offending records and add the merged ones.
212
a = self.diff_main(text_delete, text_insert, False)
213
diffs[pointer - count_delete - count_insert : pointer] = a
214
pointer = pointer - count_delete - count_insert + len(a)
222
diffs.pop() # Remove the dummy entry at the end.
225
def diff_linesToChars(self, text1, text2):
226
"""Split two texts into an array of strings. Reduce the texts to a string
227
of hashes where each Unicode character represents one line.
231
text2: Second string.
234
Three element tuple, containing the encoded text1, the encoded text2 and
235
the array of unique strings. The zeroth element of the array of unique
236
strings is intentionally blank.
238
lineArray = [] # e.g. lineArray[4] == "Hello\n"
239
lineHash = {} # e.g. lineHash["Hello\n"] == 4
241
# "\x00" is a valid character, but various debuggers don't like it.
242
# So we'll insert a junk entry to avoid generating a null character.
245
def diff_linesToCharsMunge(text):
246
"""Split a text into an array of strings. Reduce the texts to a string
247
of hashes where each Unicode character represents one line.
248
Modifies linearray and linehash through being a closure.
251
text: String to encode.
257
# Walk the text, pulling out a substring for each line.
258
# text.split('\n') would would temporarily double our memory footprint.
259
# Modifying text would create many large strings to garbage collect.
262
while lineEnd < len(text) - 1:
263
lineEnd = text.find('\n', lineStart)
265
lineEnd = len(text) - 1
266
line = text[lineStart:lineEnd + 1]
267
lineStart = lineEnd + 1
270
chars.append(unichr(lineHash[line]))
272
lineArray.append(line)
273
lineHash[line] = len(lineArray) - 1
274
chars.append(unichr(len(lineArray) - 1))
275
return "".join(chars)
277
chars1 = diff_linesToCharsMunge(text1)
278
chars2 = diff_linesToCharsMunge(text2)
279
return (chars1, chars2, lineArray)
281
def diff_charsToLines(self, diffs, lineArray):
282
"""Rehydrate the text in a diff from a string of line hashes to real lines
286
diffs: Array of diff tuples.
287
lineArray: Array of unique strings.
289
for x in xrange(len(diffs)):
291
for char in diffs[x][1]:
292
text.append(lineArray[ord(char)])
293
diffs[x] = (diffs[x][0], "".join(text))
295
def diff_map(self, text1, text2):
296
"""Explore the intersection points between the two texts.
299
text1: Old string to be diffed.
300
text2: New string to be diffed.
303
Array of diff tuples or None if no diff available.
306
# Unlike in most languages, Python counts time in seconds.
307
s_end = time.time() + self.Diff_Timeout # Don't run for too long.
308
# Cache the text lengths to prevent multiple calls.
309
text1_length = len(text1)
310
text2_length = len(text2)
311
max_d = text1_length + text2_length - 1
312
doubleEnd = self.Diff_DualThreshold * 2 < max_d
321
# If the total number of characters is odd, then the front path will
322
# collide with the reverse path.
323
front = (text1_length + text2_length) % 2
324
for d in xrange(max_d):
325
# Bail out if timeout reached.
326
if self.Diff_Timeout > 0 and time.time() > s_end:
329
# Walk the front path one step.
331
for k in xrange(-d, d + 1, 2):
332
if k == -d or k != d and v1[k - 1] < v1[k + 1]:
339
if front and footstep in footsteps:
342
footsteps[footstep] = d
344
while (not done and x < text1_length and y < text2_length and
345
text1[x] == text2[y]):
350
if front and footstep in footsteps:
353
footsteps[footstep] = d
356
v_map1[d][(x, y)] = True
357
if x == text1_length and y == text2_length:
358
# Reached the end in single-path mode.
359
return self.diff_path1(v_map1, text1, text2)
361
# Front path ran over reverse path.
362
v_map2 = v_map2[:footsteps[footstep] + 1]
363
a = self.diff_path1(v_map1, text1[:x], text2[:y])
364
b = self.diff_path2(v_map2, text1[x:], text2[y:])
368
# Walk the reverse path one step.
370
for k in xrange(-d, d + 1, 2):
371
if k == -d or k != d and v2[k - 1] < v2[k + 1]:
376
footstep = (text1_length - x, text2_length - y)
377
if not front and footstep in footsteps:
380
footsteps[footstep] = d
381
while (not done and x < text1_length and y < text2_length and
382
text1[-x - 1] == text2[-y - 1]):
385
footstep = (text1_length - x, text2_length - y)
386
if not front and footstep in footsteps:
389
footsteps[footstep] = d
392
v_map2[d][(x, y)] = True
394
# Reverse path ran over front path.
395
v_map1 = v_map1[:footsteps[footstep] + 1]
396
a = self.diff_path1(v_map1, text1[:text1_length - x],
397
text2[:text2_length - y])
398
b = self.diff_path2(v_map2, text1[text1_length - x:],
399
text2[text2_length - y:])
402
# Number of diffs equals number of characters, no commonality at all.
405
def diff_path1(self, v_map, text1, text2):
406
"""Work from the middle back to the start to determine the path.
409
v_map: Array of paths.
410
text1: Old string fragment to be diffed.
411
text2: New string fragment to be diffed.
414
Array of diff tuples.
420
for d in xrange(len(v_map) - 2, -1, -1):
422
if (x - 1, y) in v_map[d]:
424
if last_op == self.DIFF_DELETE:
425
path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
427
path[:0] = [(self.DIFF_DELETE, text1[x])]
428
last_op = self.DIFF_DELETE
430
elif (x, y - 1) in v_map[d]:
432
if last_op == self.DIFF_INSERT:
433
path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
435
path[:0] = [(self.DIFF_INSERT, text2[y])]
436
last_op = self.DIFF_INSERT
441
assert text1[x] == text2[y], ("No diagonal. " +
442
"Can't happen. (diff_path1)")
443
if last_op == self.DIFF_EQUAL:
444
path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
446
path[:0] = [(self.DIFF_EQUAL, text1[x])]
447
last_op = self.DIFF_EQUAL
450
def diff_path2(self, v_map, text1, text2):
451
"""Work from the middle back to the end to determine the path.
454
v_map: Array of paths.
455
text1: Old string fragment to be diffed.
456
text2: New string fragment to be diffed.
459
Array of diff tuples.
465
for d in xrange(len(v_map) - 2, -1, -1):
467
if (x - 1, y) in v_map[d]:
469
if last_op == self.DIFF_DELETE:
470
path[-1] = (self.DIFF_DELETE, path[-1][1] + text1[-x - 1])
472
path.append((self.DIFF_DELETE, text1[-x - 1]))
473
last_op = self.DIFF_DELETE
475
elif (x, y - 1) in v_map[d]:
477
if last_op == self.DIFF_INSERT:
478
path[-1] = (self.DIFF_INSERT, path[-1][1] + text2[-y - 1])
480
path.append((self.DIFF_INSERT, text2[-y - 1]))
481
last_op = self.DIFF_INSERT
486
assert text1[-x - 1] == text2[-y - 1], ("No diagonal. " +
487
"Can't happen. (diff_path2)")
488
if last_op == self.DIFF_EQUAL:
489
path[-1] = (self.DIFF_EQUAL, path[-1][1] + text1[-x - 1])
491
path.append((self.DIFF_EQUAL, text1[-x - 1]))
492
last_op = self.DIFF_EQUAL
495
def diff_commonPrefix(self, text1, text2):
496
"""Determine the common prefix of two strings.
500
text2: Second string.
503
The number of characters common to the start of each string.
505
# Quick check for common null cases.
506
if not text1 or not text2 or text1[0] != text2[0]:
509
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
511
pointermax = min(len(text1), len(text2))
512
pointermid = pointermax
514
while pointermin < pointermid:
515
if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
516
pointermin = pointermid
517
pointerstart = pointermin
519
pointermax = pointermid
520
pointermid = int((pointermax - pointermin) / 2 + pointermin)
523
def diff_commonSuffix(self, text1, text2):
524
"""Determine the common suffix of two strings.
528
text2: Second string.
531
The number of characters common to the end of each string.
533
# Quick check for common null cases.
534
if not text1 or not text2 or text1[-1] != text2[-1]:
537
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
539
pointermax = min(len(text1), len(text2))
540
pointermid = pointermax
542
while pointermin < pointermid:
543
if (text1[-pointermid:len(text1) - pointerend] ==
544
text2[-pointermid:len(text2) - pointerend]):
545
pointermin = pointermid
546
pointerend = pointermin
548
pointermax = pointermid
549
pointermid = int((pointermax - pointermin) / 2 + pointermin)
552
def diff_halfMatch(self, text1, text2):
553
"""Do the two texts share a substring which is at least half the length of
558
text2: Second string.
561
Five element Array, containing the prefix of text1, the suffix of text1,
562
the prefix of text2, the suffix of text2 and the common middle. Or None
563
if there was no match.
565
if len(text1) > len(text2):
566
(longtext, shorttext) = (text1, text2)
568
(shorttext, longtext) = (text1, text2)
569
if len(longtext) < 10 or len(shorttext) < 1:
570
return None # Pointless.
572
def diff_halfMatchI(longtext, shorttext, i):
573
"""Does a substring of shorttext exist within longtext such that the
574
substring is at least half the length of longtext?
575
Closure, but does not reference any external variables.
578
longtext: Longer string.
579
shorttext: Shorter string.
580
i: Start index of quarter length substring within longtext.
583
Five element Array, containing the prefix of longtext, the suffix of
584
longtext, the prefix of shorttext, the suffix of shorttext and the
585
common middle. Or None if there was no match.
587
seed = longtext[i:i + len(longtext) / 4]
589
j = shorttext.find(seed)
591
prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
592
suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
593
if len(best_common) < suffixLength + prefixLength:
594
best_common = (shorttext[j - suffixLength:j] +
595
shorttext[j:j + prefixLength])
596
best_longtext_a = longtext[:i - suffixLength]
597
best_longtext_b = longtext[i + prefixLength:]
598
best_shorttext_a = shorttext[:j - suffixLength]
599
best_shorttext_b = shorttext[j + prefixLength:]
600
j = shorttext.find(seed, j + 1)
602
if len(best_common) >= len(longtext) / 2:
603
return (best_longtext_a, best_longtext_b,
604
best_shorttext_a, best_shorttext_b, best_common)
608
# First check if the second quarter is the seed for a half-match.
609
hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
610
# Check again based on the third quarter.
611
hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
612
if not hm1 and not hm2:
619
# Both matched. Select the longest.
620
if len(hm1[4]) > len(hm2[4]):
625
# A half-match was found, sort out the return data.
626
if len(text1) > len(text2):
627
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
629
(text2_a, text2_b, text1_a, text1_b, mid_common) = hm
630
return (text1_a, text1_b, text2_a, text2_b, mid_common)
632
def diff_cleanupSemantic(self, diffs):
633
"""Reduce the number of edits by eliminating semantically trivial
637
diffs: Array of diff tuples.
640
equalities = [] # Stack of indices where equalities are found.
641
lastequality = None # Always equal to equalities[-1][1]
642
pointer = 0 # Index of current position.
643
length_changes1 = 0 # Number of chars that changed prior to the equality.
644
length_changes2 = 0 # Number of chars that changed after the equality.
645
while pointer < len(diffs):
646
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
647
equalities.append(pointer)
648
length_changes1 = length_changes2
650
lastequality = diffs[pointer][1]
651
else: # an insertion or deletion
652
length_changes2 += len(diffs[pointer][1])
653
if (lastequality != None and (len(lastequality) <= length_changes1) and
654
(len(lastequality) <= length_changes2)):
656
diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
657
# Change second copy to insert.
658
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
659
diffs[equalities[-1] + 1][1])
660
# Throw away the equality we just deleted.
662
# Throw away the previous equality (it needs to be reevaluated).
663
if len(equalities) != 0:
666
pointer = equalities[-1]
669
length_changes1 = 0 # Reset the counters.
676
self.diff_cleanupMerge(diffs)
678
self.diff_cleanupSemanticLossless(diffs)
680
def diff_cleanupSemanticLossless(self, diffs):
681
"""Look for single edits surrounded on both sides by equalities
682
which can be shifted sideways to align the edit to a word boundary.
683
e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
686
diffs: Array of diff tuples.
689
def diff_cleanupSemanticScore(one, two):
690
"""Given two strings, compute a score representing whether the
691
internal boundary falls on logical boundaries.
692
Scores range from 5 (best) to 0 (worst).
693
Closure, but does not reference any external variables.
702
if not one or not two:
703
# Edges are the best.
706
# Each port of this function behaves slightly differently due to
707
# subtle differences in each language's definition of things like
708
# 'whitespace'. Since this function's purpose is largely cosmetic,
709
# the choice has been made to use each language's native features
710
# rather than force total conformity.
712
# One point for non-alphanumeric.
713
if not one[-1].isalnum() or not two[0].isalnum():
715
# Two points for whitespace.
716
if one[-1].isspace() or two[0].isspace():
718
# Three points for line breaks.
719
if (one[-1] == "\r" or one[-1] == "\n" or
720
two[0] == "\r" or two[0] == "\n"):
722
# Four points for blank lines.
723
if (re.search("\\n\\r?\\n$", one) or
724
re.match("^\\r?\\n\\r?\\n", two)):
729
# Intentionally ignore the first and last element (don't need checking).
730
while pointer < len(diffs) - 1:
731
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
732
diffs[pointer + 1][0] == self.DIFF_EQUAL):
733
# This is a single edit surrounded by equalities.
734
equality1 = diffs[pointer - 1][1]
735
edit = diffs[pointer][1]
736
equality2 = diffs[pointer + 1][1]
738
# First, shift the edit as far left as possible.
739
commonOffset = self.diff_commonSuffix(equality1, edit)
741
commonString = edit[-commonOffset:]
742
equality1 = equality1[:-commonOffset]
743
edit = commonString + edit[:-commonOffset]
744
equality2 = commonString + equality2
746
# Second, step character by character right, looking for the best fit.
747
bestEquality1 = equality1
749
bestEquality2 = equality2
750
bestScore = (diff_cleanupSemanticScore(equality1, edit) +
751
diff_cleanupSemanticScore(edit, equality2))
752
while edit and equality2 and edit[0] == equality2[0]:
754
edit = edit[1:] + equality2[0]
755
equality2 = equality2[1:]
756
score = (diff_cleanupSemanticScore(equality1, edit) +
757
diff_cleanupSemanticScore(edit, equality2))
758
# The >= encourages trailing rather than leading whitespace on edits.
759
if score >= bestScore:
761
bestEquality1 = equality1
763
bestEquality2 = equality2
765
if diffs[pointer - 1][1] != bestEquality1:
766
# We have an improvement, save it back to the diff.
768
diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
770
del diffs[pointer - 1]
772
diffs[pointer] = (diffs[pointer][0], bestEdit)
774
diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
776
del diffs[pointer + 1]
780
def diff_cleanupEfficiency(self, diffs):
781
"""Reduce the number of edits by eliminating operationally trivial
785
diffs: Array of diff tuples.
788
equalities = [] # Stack of indices where equalities are found.
789
lastequality = '' # Always equal to equalities[-1][1]
790
pointer = 0 # Index of current position.
791
pre_ins = False # Is there an insertion operation before the last equality.
792
pre_del = False # Is there a deletion operation before the last equality.
793
post_ins = False # Is there an insertion operation after the last equality.
794
post_del = False # Is there a deletion operation after the last equality.
795
while pointer < len(diffs):
796
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
797
if (len(diffs[pointer][1]) < self.Diff_EditCost and
798
(post_ins or post_del)):
800
equalities.append(pointer)
803
lastequality = diffs[pointer][1]
805
# Not a candidate, and can never become one.
809
post_ins = post_del = False
810
else: # an insertion or deletion
811
if diffs[pointer][0] == self.DIFF_DELETE:
816
# Five types to be split:
817
# <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
818
# <ins>A</ins>X<ins>C</ins><del>D</del>
819
# <ins>A</ins><del>B</del>X<ins>C</ins>
820
# <ins>A</del>X<ins>C</ins><del>D</del>
821
# <ins>A</ins><del>B</del>X<del>C</del>
823
if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
824
((len(lastequality) < self.Diff_EditCost / 2) and
825
(pre_ins + pre_del + post_ins + post_del) == 3)):
827
diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
828
# Change second copy to insert.
829
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
830
diffs[equalities[-1] + 1][1])
831
equalities.pop() # Throw away the equality we just deleted
833
if pre_ins and pre_del:
834
# No changes made which could affect previous entry, keep going.
835
post_ins = post_del = True
839
equalities.pop() # Throw away the previous equality
841
pointer = equalities[-1]
844
post_ins = post_del = False
849
self.diff_cleanupMerge(diffs)
851
def diff_cleanupMerge(self, diffs):
852
"""Reorder and merge like edit sections. Merge equalities.
853
Any edit section can move as long as it doesn't cross an equality.
856
diffs: Array of diff tuples.
858
diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.
864
while pointer < len(diffs):
865
if diffs[pointer][0] == self.DIFF_INSERT:
867
text_insert += diffs[pointer][1]
869
elif diffs[pointer][0] == self.DIFF_DELETE:
871
text_delete += diffs[pointer][1]
873
elif diffs[pointer][0] == self.DIFF_EQUAL:
874
# Upon reaching an equality, check for prior redundancies.
875
if count_delete != 0 or count_insert != 0:
876
if count_delete != 0 and count_insert != 0:
877
# Factor out any common prefixies.
878
commonlength = self.diff_commonPrefix(text_insert, text_delete)
879
if commonlength != 0:
880
x = pointer - count_delete - count_insert - 1
881
if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
882
diffs[x] = (diffs[x][0], diffs[x][1] +
883
text_insert[:commonlength])
885
diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
39
"""Class containing the diff, match and patch methods.
41
Also contains the behaviour settings.
46
"""Inits a diff_match_patch object with default settings.
48
Redefine these in your program to override the defaults.
52
# Number of seconds to map a diff before giving up (0 for infinity).
53
self.Diff_Timeout = 1.0
54
# Cost of an empty edit operation in terms of edit characters.
55
self.Diff_EditCost = 4
56
# The size beyond which the double-ended diff activates.
57
# Double-ending is twice as fast, but less accurate.
58
self.Diff_DualThreshold = 32
59
# At what point is no match declared (0.0 = perfection, 1.0 = very
61
self.Match_Threshold = 0.5
62
# How far to search for a match (0 = exact location, 1000+ = broad match).
63
# A match this many characters away from the expected location will add
64
# 1.0 to the score (0.0 is a perfect match).
65
self.Match_Distance = 1000
66
# When deleting a large block of text (over ~64 characters), how close does
67
# the contents have to match the expected contents. (0.0 = perfection,
68
# 1.0 = very loose). Note that Match_Threshold controls how closely the
69
# end points of a delete need to match.
70
self.Patch_DeleteThreshold = 0.5
71
# Chunk size for context length.
74
# How many bits in a number?
75
# Python has no maximum, thus to disable patch splitting set to 0.
76
# However to avoid long patches in certain pathological cases, use 32.
77
# Multiple short patches (using native ints) are much faster than long
79
self.Match_MaxBits = 32
83
# The data structure representing a diff is an array of tuples:
84
# [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
85
# which means: delete "Hello", add "Goodbye" and keep " world."
90
def diff_main(self, text1, text2, checklines=True):
91
"""Find the differences between two texts. Simplifies the problem by
92
stripping any common prefix or suffix off the texts before diffing.
95
text1: Old string to be diffed.
96
text2: New string to be diffed.
97
checklines: Optional speedup flag. If present and false, then don't run
98
a line-level diff first to identify the changed areas.
99
Defaults to true, which does a faster, slightly less optimal diff.
106
# Check for equality (speedup)
108
return [(self.DIFF_EQUAL, text1)]
110
# Trim off common prefix (speedup)
111
commonlength = self.diff_commonPrefix(text1, text2)
112
commonprefix = text1[:commonlength]
113
text1 = text1[commonlength:]
114
text2 = text2[commonlength:]
116
# Trim off common suffix (speedup)
117
commonlength = self.diff_commonSuffix(text1, text2)
118
if commonlength == 0:
121
commonsuffix = text1[-commonlength:]
122
text1 = text1[:-commonlength]
123
text2 = text2[:-commonlength]
125
# Compute the diff on the middle block
126
diffs = self.diff_compute(text1, text2, checklines)
128
# Restore the prefix and suffix
130
diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
132
diffs.append((self.DIFF_EQUAL, commonsuffix))
133
self.diff_cleanupMerge(diffs)
136
def diff_compute(self, text1, text2, checklines):
137
"""Find the differences between two texts. Assumes that the texts do
138
not have any common prefix or suffix.
141
text1: Old string to be diffed.
142
text2: New string to be diffed.
143
checklines: Speedup flag. If false, then don't run a line-level diff
144
first to identify the changed areas.
145
If true, then run a faster, slightly less optimal diff.
152
# Just add some text (speedup)
153
return [(self.DIFF_INSERT, text2)]
156
# Just delete some text (speedup)
157
return [(self.DIFF_DELETE, text1)]
159
if len(text1) > len(text2):
160
(longtext, shorttext) = (text1, text2)
162
(shorttext, longtext) = (text1, text2)
163
i = longtext.find(shorttext)
165
# Shorter text is inside the longer text (speedup)
166
diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
167
(self.DIFF_INSERT, longtext[i + len(shorttext):])]
168
# Swap insertions for deletions if diff is reversed.
169
if len(text1) > len(text2):
170
diffs[0] = (self.DIFF_DELETE, diffs[0][1])
171
diffs[2] = (self.DIFF_DELETE, diffs[2][1])
173
longtext = shorttext = None # Garbage collect.
175
# Check to see if the problem can be split in two.
176
hm = self.diff_halfMatch(text1, text2)
178
# A half-match was found, sort out the return data.
179
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
180
# Send both pairs off for separate processing.
181
diffs_a = self.diff_main(text1_a, text2_a, checklines)
182
diffs_b = self.diff_main(text1_b, text2_b, checklines)
184
return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
186
# Perform a real diff.
187
if checklines and (len(text1) < 100 or len(text2) < 100):
188
checklines = False # Too trivial for the overhead.
190
# Scan the text on a line-by-line basis first.
191
(text1, text2, linearray) = self.diff_linesToChars(text1, text2)
193
diffs = self.diff_map(text1, text2)
194
if not diffs: # No acceptable result.
195
diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
197
# Convert the diff back to original text.
198
self.diff_charsToLines(diffs, linearray)
199
# Eliminate freak matches (e.g. blank lines)
200
self.diff_cleanupSemantic(diffs)
202
# Rediff any replacement blocks, this time character-by-character.
203
# Add a dummy entry at the end.
204
diffs.append((self.DIFF_EQUAL, ''))
210
while pointer < len(diffs):
211
if diffs[pointer][0] == self.DIFF_INSERT:
213
text_insert += diffs[pointer][1]
214
elif diffs[pointer][0] == self.DIFF_DELETE:
216
text_delete += diffs[pointer][1]
217
elif diffs[pointer][0] == self.DIFF_EQUAL:
218
# Upon reaching an equality, check for prior redundancies.
219
if count_delete >= 1 and count_insert >= 1:
220
# Delete the offending records and add the merged ones.
221
a = self.diff_main(text_delete, text_insert, False)
222
diffs[pointer - count_delete -
223
count_insert: pointer] = a
224
pointer = pointer - count_delete - \
225
count_insert + len(a)
887
text_insert = text_insert[commonlength:]
888
text_delete = text_delete[commonlength:]
889
# Factor out any common suffixies.
890
commonlength = self.diff_commonSuffix(text_insert, text_delete)
891
if commonlength != 0:
892
diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
894
text_insert = text_insert[:-commonlength]
895
text_delete = text_delete[:-commonlength]
896
# Delete the offending records and add the merged ones.
897
if count_delete == 0:
898
diffs[pointer - count_insert : pointer] = [
899
(self.DIFF_INSERT, text_insert)]
900
elif count_insert == 0:
901
diffs[pointer - count_delete : pointer] = [
902
(self.DIFF_DELETE, text_delete)]
904
diffs[pointer - count_delete - count_insert : pointer] = [
905
(self.DIFF_DELETE, text_delete),
906
(self.DIFF_INSERT, text_insert)]
907
pointer = pointer - count_delete - count_insert + 1
908
if count_delete != 0:
910
if count_insert != 0:
912
elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
913
# Merge this equality with the previous one.
914
diffs[pointer - 1] = (diffs[pointer - 1][0],
915
diffs[pointer - 1][1] + diffs[pointer][1])
233
diffs.pop() # Remove the dummy entry at the end.
236
def diff_linesToChars(self, text1, text2):
237
"""Split two texts into an array of strings. Reduce the texts to a
238
string of hashes where each Unicode character represents one line.
242
text2: Second string.
245
Three element tuple, containing the encoded text1, the encoded text2 and
246
the array of unique strings. The zeroth element of the array of unique
247
strings is intentionally blank.
250
lineArray = [] # e.g. lineArray[4] == "Hello\n"
251
lineHash = {} # e.g. lineHash["Hello\n"] == 4
253
# "\x00" is a valid character, but various debuggers don't like it.
254
# So we'll insert a junk entry to avoid generating a null character.
257
def diff_linesToCharsMunge(text):
258
"""Split a text into an array of strings. Reduce the texts to a
259
string of hashes where each Unicode character represents one line.
260
Modifies linearray and linehash through being a closure.
263
text: String to encode.
270
# Walk the text, pulling out a substring for each line.
271
# text.split('\n') would would temporarily double our memory footprint.
272
# Modifying text would create many large strings to garbage
276
while lineEnd < len(text) - 1:
277
lineEnd = text.find('\n', lineStart)
279
lineEnd = len(text) - 1
280
line = text[lineStart:lineEnd + 1]
281
lineStart = lineEnd + 1
284
chars.append(unichr(lineHash[line]))
286
lineArray.append(line)
287
lineHash[line] = len(lineArray) - 1
288
chars.append(unichr(len(lineArray) - 1))
289
return ''.join(chars)
291
chars1 = diff_linesToCharsMunge(text1)
292
chars2 = diff_linesToCharsMunge(text2)
293
return (chars1, chars2, lineArray)
295
def diff_charsToLines(self, diffs, lineArray):
296
"""Rehydrate the text in a diff from a string of line hashes to real
300
diffs: Array of diff tuples.
301
lineArray: Array of unique strings.
304
for x in xrange(len(diffs)):
306
for char in diffs[x][1]:
307
text.append(lineArray[ord(char)])
308
diffs[x] = (diffs[x][0], ''.join(text))
310
def diff_map(self, text1, text2):
311
"""Explore the intersection points between the two texts.
314
text1: Old string to be diffed.
315
text2: New string to be diffed.
318
Array of diff tuples or None if no diff available.
322
# Unlike in most languages, Python counts time in seconds.
323
s_end = time.time() + self.Diff_Timeout # Don't run for too long.
324
# Cache the text lengths to prevent multiple calls.
325
text1_length = len(text1)
326
text2_length = len(text2)
327
max_d = text1_length + text2_length - 1
328
doubleEnd = self.Diff_DualThreshold * 2 < max_d
337
# If the total number of characters is odd, then the front path will
338
# collide with the reverse path.
339
front = (text1_length + text2_length) % 2
340
for d in xrange(max_d):
341
# Bail out if timeout reached.
342
if self.Diff_Timeout > 0 and time.time() > s_end:
345
# Walk the front path one step.
347
for k in xrange(-d, d + 1, 2):
348
if k == -d or k != d and v1[k - 1] < v1[k + 1]:
355
if front and footstep in footsteps:
358
footsteps[footstep] = d
360
while (not done and x < text1_length and y < text2_length and
361
text1[x] == text2[y]):
366
if front and footstep in footsteps:
369
footsteps[footstep] = d
372
v_map1[d][(x, y)] = True
373
if x == text1_length and y == text2_length:
374
# Reached the end in single-path mode.
375
return self.diff_path1(v_map1, text1, text2)
377
# Front path ran over reverse path.
378
v_map2 = v_map2[:footsteps[footstep] + 1]
379
a = self.diff_path1(v_map1, text1[:x], text2[:y])
380
b = self.diff_path2(v_map2, text1[x:], text2[y:])
384
# Walk the reverse path one step.
386
for k in xrange(-d, d + 1, 2):
387
if k == -d or k != d and v2[k - 1] < v2[k + 1]:
392
footstep = (text1_length - x, text2_length - y)
393
if not front and footstep in footsteps:
396
footsteps[footstep] = d
397
while (not done and x < text1_length and y < text2_length and
398
text1[-x - 1] == text2[-y - 1]):
401
footstep = (text1_length - x, text2_length - y)
402
if not front and footstep in footsteps:
405
footsteps[footstep] = d
408
v_map2[d][(x, y)] = True
410
# Reverse path ran over front path.
411
v_map1 = v_map1[:footsteps[footstep] + 1]
412
a = self.diff_path1(v_map1, text1[:text1_length - x],
413
text2[:text2_length - y])
414
b = self.diff_path2(v_map2, text1[text1_length - x:],
415
text2[text2_length - y:])
418
# Number of diffs equals number of characters, no commonality at all.
421
def diff_path1(self, v_map, text1, text2):
422
"""Work from the middle back to the start to determine the path.
425
v_map: Array of paths.
426
text1: Old string fragment to be diffed.
427
text2: New string fragment to be diffed.
430
Array of diff tuples.
437
for d in xrange(len(v_map) - 2, -1, -1):
439
if (x - 1, y) in v_map[d]:
441
if last_op == self.DIFF_DELETE:
442
path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
444
path[:0] = [(self.DIFF_DELETE, text1[x])]
445
last_op = self.DIFF_DELETE
447
elif (x, y - 1) in v_map[d]:
449
if last_op == self.DIFF_INSERT:
450
path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
452
path[:0] = [(self.DIFF_INSERT, text2[y])]
453
last_op = self.DIFF_INSERT
458
assert text1[x] == text2[y], ('No diagonal. ' +
459
"Can't happen. (diff_path1)")
460
if last_op == self.DIFF_EQUAL:
461
path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
463
path[:0] = [(self.DIFF_EQUAL, text1[x])]
464
last_op = self.DIFF_EQUAL
467
def diff_path2(self, v_map, text1, text2):
468
"""Work from the middle back to the end to determine the path.
471
v_map: Array of paths.
472
text1: Old string fragment to be diffed.
473
text2: New string fragment to be diffed.
476
Array of diff tuples.
483
for d in xrange(len(v_map) - 2, -1, -1):
485
if (x - 1, y) in v_map[d]:
487
if last_op == self.DIFF_DELETE:
488
path[-1] = (self.DIFF_DELETE, path[-1]
491
path.append((self.DIFF_DELETE, text1[-x - 1]))
492
last_op = self.DIFF_DELETE
494
elif (x, y - 1) in v_map[d]:
496
if last_op == self.DIFF_INSERT:
497
path[-1] = (self.DIFF_INSERT, path[-1]
500
path.append((self.DIFF_INSERT, text2[-y - 1]))
501
last_op = self.DIFF_INSERT
506
assert text1[-x - 1] == text2[-y - 1], ('No diagonal. ' +
507
"Can't happen. (diff_path2)")
508
if last_op == self.DIFF_EQUAL:
509
path[-1] = (self.DIFF_EQUAL, path[-1]
512
path.append((self.DIFF_EQUAL, text1[-x - 1]))
513
last_op = self.DIFF_EQUAL
516
def diff_commonPrefix(self, text1, text2):
517
"""Determine the common prefix of two strings.
521
text2: Second string.
524
The number of characters common to the start of each string.
527
# Quick check for common null cases.
528
if not text1 or not text2 or text1[0] != text2[0]:
531
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
533
pointermax = min(len(text1), len(text2))
534
pointermid = pointermax
536
while pointermin < pointermid:
537
if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
538
pointermin = pointermid
539
pointerstart = pointermin
541
pointermax = pointermid
542
pointermid = int((pointermax - pointermin) / 2 + pointermin)
545
def diff_commonSuffix(self, text1, text2):
546
"""Determine the common suffix of two strings.
550
text2: Second string.
553
The number of characters common to the end of each string.
556
# Quick check for common null cases.
557
if not text1 or not text2 or text1[-1] != text2[-1]:
560
# Performance analysis: http://neil.fraser.name/news/2007/10/09/
562
pointermax = min(len(text1), len(text2))
563
pointermid = pointermax
565
while pointermin < pointermid:
566
if (text1[-pointermid:len(text1) - pointerend] ==
567
text2[-pointermid:len(text2) - pointerend]):
568
pointermin = pointermid
569
pointerend = pointermin
571
pointermax = pointermid
572
pointermid = int((pointermax - pointermin) / 2 + pointermin)
575
def diff_halfMatch(self, text1, text2):
576
"""Do the two texts share a substring which is at least half the length
581
text2: Second string.
584
Five element Array, containing the prefix of text1, the suffix of text1,
585
the prefix of text2, the suffix of text2 and the common middle. Or None
586
if there was no match.
589
if len(text1) > len(text2):
590
(longtext, shorttext) = (text1, text2)
592
(shorttext, longtext) = (text1, text2)
593
if len(longtext) < 10 or len(shorttext) < 1:
594
return None # Pointless.
596
def diff_halfMatchI(longtext, shorttext, i):
597
"""Does a substring of shorttext exist within longtext such that
598
the substring is at least half the length of longtext? Closure, but
599
does not reference any external variables.
602
longtext: Longer string.
603
shorttext: Shorter string.
604
i: Start index of quarter length substring within longtext.
607
Five element Array, containing the prefix of longtext, the suffix of
608
longtext, the prefix of shorttext, the suffix of shorttext and the
609
common middle. Or None if there was no match.
612
seed = longtext[i:i + len(longtext) / 4]
614
j = shorttext.find(seed)
616
prefixLength = self.diff_commonPrefix(
617
longtext[i:], shorttext[j:])
618
suffixLength = self.diff_commonSuffix(
619
longtext[:i], shorttext[:j])
620
if len(best_common) < suffixLength + prefixLength:
621
best_common = (shorttext[j - suffixLength:j] +
622
shorttext[j:j + prefixLength])
623
best_longtext_a = longtext[:i - suffixLength]
624
best_longtext_b = longtext[i + prefixLength:]
625
best_shorttext_a = shorttext[:j - suffixLength]
626
best_shorttext_b = shorttext[j + prefixLength:]
627
j = shorttext.find(seed, j + 1)
629
if len(best_common) >= len(longtext) / 2:
630
return (best_longtext_a, best_longtext_b,
631
best_shorttext_a, best_shorttext_b, best_common)
635
# First check if the second quarter is the seed for a half-match.
636
hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
637
# Check again based on the third quarter.
638
hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
639
if not hm1 and not hm2:
646
# Both matched. Select the longest.
647
if len(hm1[4]) > len(hm2[4]):
652
# A half-match was found, sort out the return data.
653
if len(text1) > len(text2):
654
(text1_a, text1_b, text2_a, text2_b, mid_common) = hm
656
(text2_a, text2_b, text1_a, text1_b, mid_common) = hm
657
return (text1_a, text1_b, text2_a, text2_b, mid_common)
659
def diff_cleanupSemantic(self, diffs):
660
"""Reduce the number of edits by eliminating semantically trivial
664
diffs: Array of diff tuples.
668
equalities = [] # Stack of indices where equalities are found.
669
lastequality = None # Always equal to equalities[-1][1]
670
pointer = 0 # Index of current position.
671
# Number of chars that changed prior to the equality.
673
length_changes2 = 0 # Number of chars that changed after the equality.
674
while pointer < len(diffs):
675
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
676
equalities.append(pointer)
677
length_changes1 = length_changes2
679
lastequality = diffs[pointer][1]
680
else: # an insertion or deletion
681
length_changes2 += len(diffs[pointer][1])
682
if (lastequality != None and (len(lastequality) <= length_changes1) and
683
(len(lastequality) <= length_changes2)):
685
diffs.insert(equalities[-1],
686
(self.DIFF_DELETE, lastequality))
687
# Change second copy to insert.
688
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
689
diffs[equalities[-1] + 1][1])
690
# Throw away the equality we just deleted.
692
# Throw away the previous equality (it needs to be
694
if len(equalities) != 0:
697
pointer = equalities[-1]
700
length_changes1 = 0 # Reset the counters.
707
self.diff_cleanupMerge(diffs)
709
self.diff_cleanupSemanticLossless(diffs)
711
def diff_cleanupSemanticLossless(self, diffs):
712
"""Look for single edits surrounded on both sides by equalities
713
which can be shifted sideways to align the edit to a word boundary.
714
e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
717
diffs: Array of diff tuples.
720
def diff_cleanupSemanticScore(one, two):
721
"""Given two strings, compute a score representing whether the
722
internal boundary falls on logical boundaries. Scores range from 5
723
(best) to 0 (worst). Closure, but does not reference any external
734
if not one or not two:
735
# Edges are the best.
738
# Each port of this function behaves slightly differently due to
739
# subtle differences in each language's definition of things like
740
# 'whitespace'. Since this function's purpose is largely cosmetic,
741
# the choice has been made to use each language's native features
742
# rather than force total conformity.
744
# One point for non-alphanumeric.
745
if not one[-1].isalnum() or not two[0].isalnum():
747
# Two points for whitespace.
748
if one[-1].isspace() or two[0].isspace():
750
# Three points for line breaks.
751
if (one[-1] == '\r' or one[-1] == '\n' or
752
two[0] == '\r' or two[0] == '\n'):
754
# Four points for blank lines.
755
if (re.search('\\n\\r?\\n$', one) or
756
re.match('^\\r?\\n\\r?\\n', two)):
761
# Intentionally ignore the first and last element (don't need
763
while pointer < len(diffs) - 1:
764
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
765
diffs[pointer + 1][0] == self.DIFF_EQUAL):
766
# This is a single edit surrounded by equalities.
767
equality1 = diffs[pointer - 1][1]
768
edit = diffs[pointer][1]
769
equality2 = diffs[pointer + 1][1]
771
# First, shift the edit as far left as possible.
772
commonOffset = self.diff_commonSuffix(equality1, edit)
774
commonString = edit[-commonOffset:]
775
equality1 = equality1[:-commonOffset]
776
edit = commonString + edit[:-commonOffset]
777
equality2 = commonString + equality2
779
# Second, step character by character right, looking for the
781
bestEquality1 = equality1
783
bestEquality2 = equality2
784
bestScore = (diff_cleanupSemanticScore(equality1, edit) +
785
diff_cleanupSemanticScore(edit, equality2))
786
while edit and equality2 and edit[0] == equality2[0]:
788
edit = edit[1:] + equality2[0]
789
equality2 = equality2[1:]
790
score = (diff_cleanupSemanticScore(equality1, edit) +
791
diff_cleanupSemanticScore(edit, equality2))
792
# The >= encourages trailing rather than leading whitespace
794
if score >= bestScore:
796
bestEquality1 = equality1
798
bestEquality2 = equality2
800
if diffs[pointer - 1][1] != bestEquality1:
801
# We have an improvement, save it back to the diff.
803
diffs[pointer - 1] = (diffs[pointer - 1]
806
del diffs[pointer - 1]
808
diffs[pointer] = (diffs[pointer][0], bestEdit)
810
diffs[pointer + 1] = (diffs[pointer + 1]
813
del diffs[pointer + 1]
817
def diff_cleanupEfficiency(self, diffs):
818
"""Reduce the number of edits by eliminating operationally trivial
822
diffs: Array of diff tuples.
826
equalities = [] # Stack of indices where equalities are found.
827
lastequality = '' # Always equal to equalities[-1][1]
828
pointer = 0 # Index of current position.
829
# Is there an insertion operation before the last equality.
831
# Is there a deletion operation before the last equality.
833
# Is there an insertion operation after the last equality.
835
# Is there a deletion operation after the last equality.
837
while pointer < len(diffs):
838
if diffs[pointer][0] == self.DIFF_EQUAL: # equality found
839
if (len(diffs[pointer][1]) < self.Diff_EditCost and
840
(post_ins or post_del)):
842
equalities.append(pointer)
845
lastequality = diffs[pointer][1]
847
# Not a candidate, and can never become one.
851
post_ins = post_del = False
852
else: # an insertion or deletion
853
if diffs[pointer][0] == self.DIFF_DELETE:
858
# Five types to be split:
859
# <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
860
# <ins>A</ins>X<ins>C</ins><del>D</del>
861
# <ins>A</ins><del>B</del>X<ins>C</ins>
862
# <ins>A</del>X<ins>C</ins><del>D</del>
863
# <ins>A</ins><del>B</del>X<del>C</del>
865
if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
866
((len(lastequality) < self.Diff_EditCost / 2) and
867
(pre_ins + pre_del + post_ins + post_del) == 3)):
869
diffs.insert(equalities[-1],
870
(self.DIFF_DELETE, lastequality))
871
# Change second copy to insert.
872
diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
873
diffs[equalities[-1] + 1][1])
874
equalities.pop() # Throw away the equality we just deleted
876
if pre_ins and pre_del:
877
# No changes made which could affect previous entry,
879
post_ins = post_del = True
883
equalities.pop() # Throw away the previous equality
885
pointer = equalities[-1]
888
post_ins = post_del = False
893
self.diff_cleanupMerge(diffs)
895
def diff_cleanupMerge(self, diffs):
896
"""Reorder and merge like edit sections. Merge equalities. Any edit
897
section can move as long as it doesn't cross an equality.
900
diffs: Array of diff tuples.
903
diffs.append((self.DIFF_EQUAL, '')) # Add a dummy entry at the end.
925
if diffs[-1][1] == '':
926
diffs.pop() # Remove the dummy entry at the end.
928
# Second pass: look for single edits surrounded on both sides by equalities
929
# which can be shifted sideways to eliminate an equality.
930
# e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
933
# Intentionally ignore the first and last element (don't need checking).
934
while pointer < len(diffs) - 1:
935
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
936
diffs[pointer + 1][0] == self.DIFF_EQUAL):
937
# This is a single edit surrounded by equalities.
938
if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
939
# Shift the edit over the previous equality.
940
diffs[pointer] = (diffs[pointer][0],
941
diffs[pointer - 1][1] +
942
diffs[pointer][1][:-len(diffs[pointer - 1][1])])
943
diffs[pointer + 1] = (diffs[pointer + 1][0],
944
diffs[pointer - 1][1] + diffs[pointer + 1][1])
945
del diffs[pointer - 1]
947
elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
948
# Shift the edit over the next equality.
949
diffs[pointer - 1] = (diffs[pointer - 1][0],
950
diffs[pointer - 1][1] + diffs[pointer + 1][1])
951
diffs[pointer] = (diffs[pointer][0],
952
diffs[pointer][1][len(diffs[pointer + 1][1]):] +
953
diffs[pointer + 1][1])
954
del diffs[pointer + 1]
958
# If shifts were made, the diff needs reordering and another shift sweep.
960
self.diff_cleanupMerge(diffs)
962
def diff_xIndex(self, diffs, loc):
963
"""loc is a location in text1, compute and return the equivalent location
964
in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
967
diffs: Array of diff tuples.
968
loc: Location within text1.
971
Location within text2.
977
for x in xrange(len(diffs)):
978
(op, text) = diffs[x]
979
if op != self.DIFF_INSERT: # Equality or deletion.
981
if op != self.DIFF_DELETE: # Equality or insertion.
983
if chars1 > loc: # Overshot the location.
988
if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
989
# The location was deleted.
991
# Add the remaining len(character).
992
return last_chars2 + (loc - last_chars1)
994
def diff_prettyHtml(self, diffs):
995
"""Convert a diff array into a pretty HTML report.
998
diffs: Array of diff tuples.
1001
HTML representation.
1005
for (op, data) in diffs:
1006
text = (data.replace("&", "&").replace("<", "<")
1007
.replace(">", ">").replace("\n", "¶<BR>"))
1008
if op == self.DIFF_INSERT:
1009
html.append("<SPAN STYLE=\"color: #00FF00;\" TITLE=\"i=%i\">%s</SPAN>"
1011
elif op == self.DIFF_DELETE:
1012
html.append("<SPAN STYLE=\"color: #FF0000;\" TITLE=\"i=%i\">%s</SPAN>"
1014
elif op == self.DIFF_EQUAL:
1015
html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text))
1016
if op != self.DIFF_DELETE:
1018
return "".join(html)
1020
def diff_text1(self, diffs):
1021
"""Compute and return the source text (all equalities and deletions).
1024
diffs: Array of diff tuples.
1030
for (op, data) in diffs:
1031
if op != self.DIFF_INSERT:
1033
return "".join(text)
1035
def diff_text2(self, diffs):
1036
"""Compute and return the destination text (all equalities and insertions).
1039
diffs: Array of diff tuples.
1045
for (op, data) in diffs:
1046
if op != self.DIFF_DELETE:
1048
return "".join(text)
1050
def diff_levenshtein(self, diffs):
1051
"""Compute the Levenshtein distance; the number of inserted, deleted or
1052
substituted characters.
1055
diffs: Array of diff tuples.
1063
for (op, data) in diffs:
1064
if op == self.DIFF_INSERT:
1065
insertions += len(data)
1066
elif op == self.DIFF_DELETE:
1067
deletions += len(data)
1068
elif op == self.DIFF_EQUAL:
1069
# A deletion and an insertion is one substitution.
1070
levenshtein += max(insertions, deletions)
909
while pointer < len(diffs):
910
if diffs[pointer][0] == self.DIFF_INSERT:
912
text_insert += diffs[pointer][1]
914
elif diffs[pointer][0] == self.DIFF_DELETE:
916
text_delete += diffs[pointer][1]
918
elif diffs[pointer][0] == self.DIFF_EQUAL:
919
# Upon reaching an equality, check for prior redundancies.
920
if count_delete != 0 or count_insert != 0:
921
if count_delete != 0 and count_insert != 0:
922
# Factor out any common prefixies.
923
commonlength = self.diff_commonPrefix(
924
text_insert, text_delete)
925
if commonlength != 0:
926
x = pointer - count_delete - count_insert - 1
927
if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
928
diffs[x] = (diffs[x][0], diffs[x][1] +
929
text_insert[:commonlength])
932
0, (self.DIFF_EQUAL, text_insert[:commonlength]))
934
text_insert = text_insert[commonlength:]
935
text_delete = text_delete[commonlength:]
936
# Factor out any common suffixies.
937
commonlength = self.diff_commonSuffix(
938
text_insert, text_delete)
939
if commonlength != 0:
940
diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
942
text_insert = text_insert[:-commonlength]
943
text_delete = text_delete[:-commonlength]
944
# Delete the offending records and add the merged ones.
945
if count_delete == 0:
946
diffs[pointer - count_insert: pointer] = [
947
(self.DIFF_INSERT, text_insert)]
948
elif count_insert == 0:
949
diffs[pointer - count_delete: pointer] = [
950
(self.DIFF_DELETE, text_delete)]
952
diffs[pointer - count_delete - count_insert: pointer] = [
953
(self.DIFF_DELETE, text_delete),
954
(self.DIFF_INSERT, text_insert)]
955
pointer = pointer - count_delete - count_insert + 1
956
if count_delete != 0:
958
if count_insert != 0:
960
elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
961
# Merge this equality with the previous one.
962
diffs[pointer - 1] = (diffs[pointer - 1][0],
963
diffs[pointer - 1][1] + diffs[pointer][1])
973
if diffs[-1][1] == '':
974
diffs.pop() # Remove the dummy entry at the end.
976
# Second pass: look for single edits surrounded on both sides by equalities
977
# which can be shifted sideways to eliminate an equality.
978
# e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
981
# Intentionally ignore the first and last element (don't need
983
while pointer < len(diffs) - 1:
984
if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
985
diffs[pointer + 1][0] == self.DIFF_EQUAL):
986
# This is a single edit surrounded by equalities.
987
if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
988
# Shift the edit over the previous equality.
989
diffs[pointer] = (diffs[pointer][0],
990
diffs[pointer - 1][1] +
991
diffs[pointer][1][:-len(diffs[pointer - 1][1])])
992
diffs[pointer + 1] = (diffs[pointer + 1][0],
993
diffs[pointer - 1][1] + diffs[pointer + 1][1])
994
del diffs[pointer - 1]
996
elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
997
# Shift the edit over the next equality.
998
diffs[pointer - 1] = (diffs[pointer - 1][0],
999
diffs[pointer - 1][1] + diffs[pointer + 1][1])
1000
diffs[pointer] = (diffs[pointer][0],
1001
diffs[pointer][1][len(diffs[pointer + 1][1]):] +
1002
diffs[pointer + 1][1])
1003
del diffs[pointer + 1]
1007
# If shifts were made, the diff needs reordering and another shift
1010
self.diff_cleanupMerge(diffs)
1012
def diff_xIndex(self, diffs, loc):
1013
"""loc is a location in text1, compute and return the equivalent location
1014
in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1017
diffs: Array of diff tuples.
1018
loc: Location within text1.
1021
Location within text2.
1027
for x in xrange(len(diffs)):
1028
(op, text) = diffs[x]
1029
if op != self.DIFF_INSERT: # Equality or deletion.
1031
if op != self.DIFF_DELETE: # Equality or insertion.
1033
if chars1 > loc: # Overshot the location.
1035
last_chars1 = chars1
1036
last_chars2 = chars2
1038
if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1039
# The location was deleted.
1041
# Add the remaining len(character).
1042
return last_chars2 + (loc - last_chars1)
1044
def diff_prettyHtml(self, diffs):
1045
"""Convert a diff array into a pretty HTML report.
1048
diffs: Array of diff tuples.
1051
HTML representation.
1056
for (op, data) in diffs:
1057
text = (data.replace('&', '&').replace('<', '<')
1058
.replace('>', '>').replace('\n', '¶<BR>'))
1059
if op == self.DIFF_INSERT:
1060
html.append("<SPAN STYLE=\"color: #00FF00;\" TITLE=\"i=%i\">%s</SPAN>"
1062
elif op == self.DIFF_DELETE:
1063
html.append("<SPAN STYLE=\"color: #FF0000;\" TITLE=\"i=%i\">%s</SPAN>"
1065
elif op == self.DIFF_EQUAL:
1066
html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text))
1067
if op != self.DIFF_DELETE:
1069
return ''.join(html)
1071
def diff_text1(self, diffs):
1072
"""Compute and return the source text (all equalities and deletions).
1075
diffs: Array of diff tuples.
1082
for (op, data) in diffs:
1083
if op != self.DIFF_INSERT:
1085
return ''.join(text)
1087
def diff_text2(self, diffs):
1088
"""Compute and return the destination text (all equalities and
1092
diffs: Array of diff tuples.
1099
for (op, data) in diffs:
1100
if op != self.DIFF_DELETE:
1102
return ''.join(text)
1104
def diff_levenshtein(self, diffs):
1105
"""Compute the Levenshtein distance; the number of inserted, deleted or
1106
substituted characters.
1109
diffs: Array of diff tuples.
1073
levenshtein += max(insertions, deletions)
1076
def diff_toDelta(self, diffs):
1077
"""Crush the diff into an encoded string which describes the operations
1078
required to transform text1 into text2.
1079
E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1080
Operations are tab-separated. Inserted text is escaped using %xx notation.
1083
diffs: Array of diff tuples.
1089
for (op, data) in diffs:
1090
if op == self.DIFF_INSERT:
1091
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
1092
data = data.encode("utf-8")
1093
text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
1094
elif op == self.DIFF_DELETE:
1095
text.append("-%d" % len(data))
1096
elif op == self.DIFF_EQUAL:
1097
text.append("=%d" % len(data))
1098
return "\t".join(text)
1100
def diff_fromDelta(self, text1, delta):
1101
"""Given the original text1, and an encoded string which describes the
1102
operations required to transform text1 into text2, compute the full diff.
1105
text1: Source string for the diff.
1109
Array of diff tuples.
1112
ValueError: If invalid input.
1114
if type(delta) == unicode:
1115
# Deltas should be composed of a subset of ascii chars, Unicode not
1116
# required. If this encode raises UnicodeEncodeError, delta is invalid.
1117
delta = delta.encode("ascii")
1119
pointer = 0 # Cursor in text1
1120
tokens = delta.split("\t")
1121
for token in tokens:
1123
# Blank tokens are ok (from a trailing \t).
1125
# Each token begins with a one character parameter which specifies the
1126
# operation of this token (delete, insert, equality).
1129
param = urllib.unquote(param).decode("utf-8")
1130
diffs.append((self.DIFF_INSERT, param))
1131
elif token[0] == "-" or token[0] == "=":
1135
raise ValueError, "Invalid number in diff_fromDelta: " + param
1137
raise ValueError, "Negative number in diff_fromDelta: " + param
1138
text = text1[pointer : pointer + n]
1141
diffs.append((self.DIFF_EQUAL, text))
1143
diffs.append((self.DIFF_DELETE, text))
1145
# Anything else is an error.
1146
raise ValueError, ("Invalid diff operation in diff_fromDelta: " +
1148
if pointer != len(text1):
1150
"Delta length (%d) does not equal source text length (%d)." %
1151
(pointer, len(text1)))
1156
def match_main(self, text, pattern, loc):
1157
"""Locate the best instance of 'pattern' in 'text' near 'loc'.
1160
text: The text to search.
1161
pattern: The pattern to search for.
1162
loc: The location to search around.
1165
Best match index or -1.
1167
loc = max(0, min(loc, len(text)))
1169
# Shortcut (potentially not guaranteed by the algorithm)
1174
elif text[loc:loc + len(pattern)] == pattern:
1175
# Perfect match at the perfect spot! (Includes case of null pattern)
1178
# Do a fuzzy compare.
1179
match = self.match_bitap(text, pattern, loc)
1182
def match_bitap(self, text, pattern, loc):
1183
"""Locate the best instance of 'pattern' in 'text' near 'loc' using the
1187
text: The text to search.
1188
pattern: The pattern to search for.
1189
loc: The location to search around.
1192
Best match index or -1.
1194
# Python doesn't have a maxint limit, so ignore this check.
1195
#if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1196
# raise ValueError("Pattern too long for this application.")
1198
# Initialise the alphabet.
1199
s = self.match_alphabet(pattern)
1201
def match_bitapScore(e, x):
1202
"""Compute and return the score for a match with e errors and x location.
1203
Accesses loc and pattern through being a closure.
1206
e: Number of errors in match.
1207
x: Location of match.
1210
Overall score for match (0.0 = good, 1.0 = bad).
1212
accuracy = float(e) / len(pattern)
1213
proximity = abs(loc - x)
1214
if not self.Match_Distance:
1215
# Dodge divide by zero error.
1216
return proximity and 1.0 or accuracy
1217
return accuracy + (proximity / float(self.Match_Distance))
1219
# Highest score beyond which we give up.
1220
score_threshold = self.Match_Threshold
1221
# Is there a nearby exact match? (speedup)
1222
best_loc = text.find(pattern, loc)
1224
score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1225
# What about in the other direction? (speedup)
1226
best_loc = text.rfind(pattern, loc + len(pattern))
1228
score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1230
# Initialise the bit arrays.
1231
matchmask = 1 << (len(pattern) - 1)
1234
bin_max = len(pattern) + len(text)
1235
# Empty initialization added to appease pychecker.
1237
for d in xrange(len(pattern)):
1238
# Scan for the best match each iteration allows for one more error.
1239
# Run a binary search to determine how far from 'loc' we can stray at
1243
while bin_min < bin_mid:
1244
if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1248
bin_mid = (bin_max - bin_min) / 2 + bin_min
1250
# Use the result from this iteration as the maximum for the next.
1252
start = max(1, loc - bin_mid + 1)
1253
finish = min(loc + bin_mid, len(text)) + len(pattern)
1255
rd = range(finish + 1)
1256
rd.append((1 << d) - 1)
1257
for j in xrange(finish, start - 1, -1):
1258
if len(text) <= j - 1:
1262
charMatch = s.get(text[j - 1], 0)
1263
if d == 0: # First pass: exact match.
1264
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1265
else: # Subsequent passes: fuzzy match.
1266
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
1267
((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
1268
if rd[j] & matchmask:
1269
score = match_bitapScore(d, j - 1)
1270
# This match will almost certainly be better than any existing match.
1272
if score <= score_threshold:
1274
score_threshold = score
1277
# When passing loc, don't exceed our current distance from loc.
1278
start = max(1, 2 * loc - best_loc)
1118
for (op, data) in diffs:
1119
if op == self.DIFF_INSERT:
1120
insertions += len(data)
1121
elif op == self.DIFF_DELETE:
1122
deletions += len(data)
1123
elif op == self.DIFF_EQUAL:
1124
# A deletion and an insertion is one substitution.
1125
levenshtein += max(insertions, deletions)
1128
levenshtein += max(insertions, deletions)
1131
def diff_toDelta(self, diffs):
1132
"""Crush the diff into an encoded string which describes the operations
1133
required to transform text1 into text2.
1134
E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1135
Operations are tab-separated. Inserted text is escaped using %xx notation.
1138
diffs: Array of diff tuples.
1144
for (op, data) in diffs:
1145
if op == self.DIFF_INSERT:
1146
# High ascii will raise UnicodeDecodeError. Use Unicode
1148
data = data.encode('utf-8')
1149
text.append('+' + urllib.quote(data, "!~*'();/?:@&=+$,# "))
1150
elif op == self.DIFF_DELETE:
1151
text.append('-%d' % len(data))
1152
elif op == self.DIFF_EQUAL:
1153
text.append('=%d' % len(data))
1154
return '\t'.join(text)
1156
def diff_fromDelta(self, text1, delta):
1157
"""Given the original text1, and an encoded string which describes the
1158
operations required to transform text1 into text2, compute the full
1162
text1: Source string for the diff.
1166
Array of diff tuples.
1169
ValueError: If invalid input.
1172
if type(delta) == unicode:
1173
# Deltas should be composed of a subset of ascii chars, Unicode not
1174
# required. If this encode raises UnicodeEncodeError, delta is
1176
delta = delta.encode('ascii')
1178
pointer = 0 # Cursor in text1
1179
tokens = delta.split('\t')
1180
for token in tokens:
1182
# Blank tokens are ok (from a trailing \t).
1184
# Each token begins with a one character parameter which specifies the
1185
# operation of this token (delete, insert, equality).
1188
param = urllib.unquote(param).decode('utf-8')
1189
diffs.append((self.DIFF_INSERT, param))
1190
elif token[0] == '-' or token[0] == '=':
1194
raise ValueError, 'Invalid number in diff_fromDelta: ' + param
1196
raise ValueError, 'Negative number in diff_fromDelta: ' + param
1197
text = text1[pointer: pointer + n]
1200
diffs.append((self.DIFF_EQUAL, text))
1202
diffs.append((self.DIFF_DELETE, text))
1280
# Already passed loc, downhill from here on in.
1282
# No hope for a (better) match at greater error levels.
1283
if match_bitapScore(d + 1, loc) > score_threshold:
1288
def match_alphabet(self, pattern):
1289
"""Initialise the alphabet for the Bitap algorithm.
1292
pattern: The text to encode.
1295
Hash of character locations.
1298
for char in pattern:
1300
for i in xrange(len(pattern)):
1301
s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1306
def patch_addContext(self, patch, text):
1307
"""Increase the context until it is unique,
1308
but don't let the pattern expand beyond Match_MaxBits.
1311
patch: The patch to grow.
1316
pattern = text[patch.start2 : patch.start2 + patch.length1]
1319
# Look for the first and last matches of pattern in text. If two different
1320
# matches are found, increase the pattern length.
1321
while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
1322
0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
1323
self.Patch_Margin)):
1324
padding += self.Patch_Margin
1325
pattern = text[max(0, patch.start2 - padding) :
1326
patch.start2 + patch.length1 + padding]
1327
# Add one chunk for good luck.
1328
padding += self.Patch_Margin
1331
prefix = text[max(0, patch.start2 - padding) : patch.start2]
1333
patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1335
suffix = text[patch.start2 + patch.length1 :
1336
patch.start2 + patch.length1 + padding]
1338
patch.diffs.append((self.DIFF_EQUAL, suffix))
1340
# Roll back the start points.
1341
patch.start1 -= len(prefix)
1342
patch.start2 -= len(prefix)
1344
patch.length1 += len(prefix) + len(suffix)
1345
patch.length2 += len(prefix) + len(suffix)
1347
def patch_make(self, a, b=None, c=None):
1348
"""Compute a list of patches to turn text1 into text2.
1349
Use diffs if provided, otherwise compute it ourselves.
1350
There are four ways to call this function, depending on what data is
1351
available to the caller:
1353
a = text1, b = text2
1357
a = text1, b = diffs
1358
Method 4 (deprecated, use method 3):
1359
a = text1, b = text2, c = diffs
1362
a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1364
b: text2 (methods 1,4) or Array of diff tuples for text1 to
1365
text2 (method 3) or undefined (method 2).
1366
c: Array of diff tuples for text1 to text2 (method 4) or
1367
undefined (methods 1,2,3).
1370
Array of patch objects.
1374
# Note that texts may arrive as 'str' or 'unicode'.
1375
if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
1376
# Method 1: text1, text2
1377
# Compute diffs from text1 and text2.
1379
diffs = self.diff_main(text1, b, True)
1381
self.diff_cleanupSemantic(diffs)
1382
self.diff_cleanupEfficiency(diffs)
1383
elif isinstance(a, list) and b is None and c is None:
1385
# Compute text1 from diffs.
1387
text1 = self.diff_text1(diffs)
1388
elif isinstance(a, basestring) and isinstance(b, list) and c is None:
1389
# Method 3: text1, diffs
1392
elif (isinstance(a, basestring) and isinstance(b, basestring) and
1393
isinstance(c, list)):
1394
# Method 4: text1, text2, diffs
1395
# text2 is not used.
1399
raise ValueError("Unknown call format to patch_make.")
1402
return [] # Get rid of the None case.
1405
char_count1 = 0 # Number of characters into the text1 string.
1406
char_count2 = 0 # Number of characters into the text2 string.
1407
prepatch_text = text1 # Recreate the patches to determine context info.
1408
postpatch_text = text1
1409
for x in xrange(len(diffs)):
1410
(diff_type, diff_text) = diffs[x]
1411
if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1412
# A new patch starts here.
1413
patch.start1 = char_count1
1414
patch.start2 = char_count2
1415
if diff_type == self.DIFF_INSERT:
1417
patch.diffs.append(diffs[x])
1418
patch.length2 += len(diff_text)
1419
postpatch_text = (postpatch_text[:char_count2] + diff_text +
1420
postpatch_text[char_count2:])
1421
elif diff_type == self.DIFF_DELETE:
1423
patch.length1 += len(diff_text)
1424
patch.diffs.append(diffs[x])
1425
postpatch_text = (postpatch_text[:char_count2] +
1426
postpatch_text[char_count2 + len(diff_text):])
1427
elif (diff_type == self.DIFF_EQUAL and
1428
len(diff_text) <= 2 * self.Patch_Margin and
1429
len(patch.diffs) != 0 and len(diffs) != x + 1):
1430
# Small equality inside a patch.
1431
patch.diffs.append(diffs[x])
1432
patch.length1 += len(diff_text)
1433
patch.length2 += len(diff_text)
1435
if (diff_type == self.DIFF_EQUAL and
1436
len(diff_text) >= 2 * self.Patch_Margin):
1437
# Time for a new patch.
1204
# Anything else is an error.
1205
raise ValueError, ('Invalid diff operation in diff_fromDelta: ' +
1207
if pointer != len(text1):
1209
'Delta length (%d) does not equal source text length (%d).' %
1210
(pointer, len(text1)))
1215
def match_main(self, text, pattern, loc):
1216
"""Locate the best instance of 'pattern' in 'text' near 'loc'.
1219
text: The text to search.
1220
pattern: The pattern to search for.
1221
loc: The location to search around.
1224
Best match index or -1.
1227
loc = max(0, min(loc, len(text)))
1229
# Shortcut (potentially not guaranteed by the algorithm)
1234
elif text[loc:loc + len(pattern)] == pattern:
1235
# Perfect match at the perfect spot! (Includes case of null
1239
# Do a fuzzy compare.
1240
match = self.match_bitap(text, pattern, loc)
1243
def match_bitap(self, text, pattern, loc):
1244
"""Locate the best instance of 'pattern' in 'text' near 'loc' using the
1248
text: The text to search.
1249
pattern: The pattern to search for.
1250
loc: The location to search around.
1253
Best match index or -1.
1256
# Python doesn't have a maxint limit, so ignore this check.
1257
# if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1258
# raise ValueError("Pattern too long for this application.")
1260
# Initialise the alphabet.
1261
s = self.match_alphabet(pattern)
1263
def match_bitapScore(e, x):
1264
"""Compute and return the score for a match with e errors and x
1265
location. Accesses loc and pattern through being a closure.
1268
e: Number of errors in match.
1269
x: Location of match.
1272
Overall score for match (0.0 = good, 1.0 = bad).
1275
accuracy = float(e) / len(pattern)
1276
proximity = abs(loc - x)
1277
if not self.Match_Distance:
1278
# Dodge divide by zero error.
1279
return proximity and 1.0 or accuracy
1280
return accuracy + (proximity / float(self.Match_Distance))
1282
# Highest score beyond which we give up.
1283
score_threshold = self.Match_Threshold
1284
# Is there a nearby exact match? (speedup)
1285
best_loc = text.find(pattern, loc)
1287
score_threshold = min(match_bitapScore(
1288
0, best_loc), score_threshold)
1289
# What about in the other direction? (speedup)
1290
best_loc = text.rfind(pattern, loc + len(pattern))
1292
score_threshold = min(match_bitapScore(
1293
0, best_loc), score_threshold)
1295
# Initialise the bit arrays.
1296
matchmask = 1 << (len(pattern) - 1)
1299
bin_max = len(pattern) + len(text)
1300
# Empty initialization added to appease pychecker.
1302
for d in xrange(len(pattern)):
1303
# Scan for the best match each iteration allows for one more error.
1304
# Run a binary search to determine how far from 'loc' we can stray at
1308
while bin_min < bin_mid:
1309
if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1313
bin_mid = (bin_max - bin_min) / 2 + bin_min
1315
# Use the result from this iteration as the maximum for the next.
1317
start = max(1, loc - bin_mid + 1)
1318
finish = min(loc + bin_mid, len(text)) + len(pattern)
1320
rd = range(finish + 1)
1321
rd.append((1 << d) - 1)
1322
for j in xrange(finish, start - 1, -1):
1323
if len(text) <= j - 1:
1327
charMatch = s.get(text[j - 1], 0)
1328
if d == 0: # First pass: exact match.
1329
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1330
else: # Subsequent passes: fuzzy match.
1331
rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
1332
((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
1333
if rd[j] & matchmask:
1334
score = match_bitapScore(d, j - 1)
1335
# This match will almost certainly be better than any existing match.
1337
if score <= score_threshold:
1339
score_threshold = score
1342
# When passing loc, don't exceed our current
1343
# distance from loc.
1344
start = max(1, 2 * loc - best_loc)
1346
# Already passed loc, downhill from here on in.
1348
# No hope for a (better) match at greater error levels.
1349
if match_bitapScore(d + 1, loc) > score_threshold:
1354
def match_alphabet(self, pattern):
1355
"""Initialise the alphabet for the Bitap algorithm.
1358
pattern: The text to encode.
1361
Hash of character locations.
1365
for char in pattern:
1367
for i in xrange(len(pattern)):
1368
s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1373
def patch_addContext(self, patch, text):
1374
"""Increase the context until it is unique, but don't let the pattern
1375
expand beyond Match_MaxBits.
1378
patch: The patch to grow.
1384
pattern = text[patch.start2: patch.start2 + patch.length1]
1387
# Look for the first and last matches of pattern in text. If two different
1388
# matches are found, increase the pattern length.
1389
while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits ==
1390
0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin -
1391
self.Patch_Margin)):
1392
padding += self.Patch_Margin
1393
pattern = text[max(0, patch.start2 - padding):
1394
patch.start2 + patch.length1 + padding]
1395
# Add one chunk for good luck.
1396
padding += self.Patch_Margin
1399
prefix = text[max(0, patch.start2 - padding): patch.start2]
1401
patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1403
suffix = text[patch.start2 + patch.length1:
1404
patch.start2 + patch.length1 + padding]
1406
patch.diffs.append((self.DIFF_EQUAL, suffix))
1408
# Roll back the start points.
1409
patch.start1 -= len(prefix)
1410
patch.start2 -= len(prefix)
1412
patch.length1 += len(prefix) + len(suffix)
1413
patch.length2 += len(prefix) + len(suffix)
1415
def patch_make(self, a, b=None, c=None):
1416
"""Compute a list of patches to turn text1 into text2.
1417
Use diffs if provided, otherwise compute it ourselves.
1418
There are four ways to call this function, depending on what data is
1419
available to the caller:
1421
a = text1, b = text2
1425
a = text1, b = diffs
1426
Method 4 (deprecated, use method 3):
1427
a = text1, b = text2, c = diffs
1430
a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1432
b: text2 (methods 1,4) or Array of diff tuples for text1 to
1433
text2 (method 3) or undefined (method 2).
1434
c: Array of diff tuples for text1 to text2 (method 4) or
1435
undefined (methods 1,2,3).
1438
Array of patch objects.
1442
# Note that texts may arrive as 'str' or 'unicode'.
1443
if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
1444
# Method 1: text1, text2
1445
# Compute diffs from text1 and text2.
1447
diffs = self.diff_main(text1, b, True)
1449
self.diff_cleanupSemantic(diffs)
1450
self.diff_cleanupEfficiency(diffs)
1451
elif isinstance(a, list) and b is None and c is None:
1453
# Compute text1 from diffs.
1455
text1 = self.diff_text1(diffs)
1456
elif isinstance(a, basestring) and isinstance(b, list) and c is None:
1457
# Method 3: text1, diffs
1460
elif (isinstance(a, basestring) and isinstance(b, basestring) and
1461
isinstance(c, list)):
1462
# Method 4: text1, text2, diffs
1463
# text2 is not used.
1467
raise ValueError('Unknown call format to patch_make.')
1470
return [] # Get rid of the None case.
1473
char_count1 = 0 # Number of characters into the text1 string.
1474
char_count2 = 0 # Number of characters into the text2 string.
1475
# Recreate the patches to determine context info.
1476
prepatch_text = text1
1477
postpatch_text = text1
1478
for x in xrange(len(diffs)):
1479
(diff_type, diff_text) = diffs[x]
1480
if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1481
# A new patch starts here.
1482
patch.start1 = char_count1
1483
patch.start2 = char_count2
1484
if diff_type == self.DIFF_INSERT:
1486
patch.diffs.append(diffs[x])
1487
patch.length2 += len(diff_text)
1488
postpatch_text = (postpatch_text[:char_count2] + diff_text +
1489
postpatch_text[char_count2:])
1490
elif diff_type == self.DIFF_DELETE:
1492
patch.length1 += len(diff_text)
1493
patch.diffs.append(diffs[x])
1494
postpatch_text = (postpatch_text[:char_count2] +
1495
postpatch_text[char_count2 + len(diff_text):])
1496
elif (diff_type == self.DIFF_EQUAL and
1497
len(diff_text) <= 2 * self.Patch_Margin and
1498
len(patch.diffs) != 0 and len(diffs) != x + 1):
1499
# Small equality inside a patch.
1500
patch.diffs.append(diffs[x])
1501
patch.length1 += len(diff_text)
1502
patch.length2 += len(diff_text)
1504
if (diff_type == self.DIFF_EQUAL and
1505
len(diff_text) >= 2 * self.Patch_Margin):
1506
# Time for a new patch.
1507
if len(patch.diffs) != 0:
1508
self.patch_addContext(patch, prepatch_text)
1509
patches.append(patch)
1511
# Unlike Unidiff, our patch lists have a rolling context.
1512
# http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1513
# Update prepatch text & pos to reflect the application of the
1514
# just completed patch.
1515
prepatch_text = postpatch_text
1516
char_count1 = char_count2
1518
# Update the current character count.
1519
if diff_type != self.DIFF_INSERT:
1520
char_count1 += len(diff_text)
1521
if diff_type != self.DIFF_DELETE:
1522
char_count2 += len(diff_text)
1524
# Pick up the leftover patch if not empty.
1438
1525
if len(patch.diffs) != 0:
1439
self.patch_addContext(patch, prepatch_text)
1440
patches.append(patch)
1442
# Unlike Unidiff, our patch lists have a rolling context.
1443
# http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
1444
# Update prepatch text & pos to reflect the application of the
1445
# just completed patch.
1446
prepatch_text = postpatch_text
1447
char_count1 = char_count2
1449
# Update the current character count.
1450
if diff_type != self.DIFF_INSERT:
1451
char_count1 += len(diff_text)
1452
if diff_type != self.DIFF_DELETE:
1453
char_count2 += len(diff_text)
1455
# Pick up the leftover patch if not empty.
1456
if len(patch.diffs) != 0:
1457
self.patch_addContext(patch, prepatch_text)
1458
patches.append(patch)
1461
def patch_deepCopy(self, patches):
1462
"""Given an array of patches, return another array that is identical.
1465
patches: Array of patch objects.
1468
Array of patch objects.
1471
for patch in patches:
1472
patchCopy = patch_obj()
1473
# No need to deep copy the tuples since they are immutable.
1474
patchCopy.diffs = patch.diffs[:]
1475
patchCopy.start1 = patch.start1
1476
patchCopy.start2 = patch.start2
1477
patchCopy.length1 = patch.length1
1478
patchCopy.length2 = patch.length2
1479
patchesCopy.append(patchCopy)
1482
def patch_apply(self, patches, text):
1483
"""Merge a set of patches onto the text. Return a patched text, as well
1484
as a list of true/false values indicating which patches were applied.
1487
patches: Array of patch objects.
1491
Two element Array, containing the new text and an array of boolean values.
1496
# Deep copy the patches so that no changes are made to originals.
1497
patches = self.patch_deepCopy(patches)
1499
nullPadding = self.patch_addPadding(patches)
1500
text = nullPadding + text + nullPadding
1501
self.patch_splitMax(patches)
1503
# delta keeps track of the offset between the expected and actual location
1504
# of the previous patch. If there are patches expected at positions 10 and
1505
# 20, but the first patch was found at 12, delta is 2 and the second patch
1506
# has an effective expected position of 22.
1509
for patch in patches:
1510
expected_loc = patch.start2 + delta
1511
text1 = self.diff_text1(patch.diffs)
1513
if len(text1) > self.Match_MaxBits:
1514
# patch_splitMax will only provide an oversized pattern in the case of
1516
start_loc = self.match_main(text, text1[:self.Match_MaxBits],
1519
end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
1520
expected_loc + len(text1) - self.Match_MaxBits)
1521
if end_loc == -1 or start_loc >= end_loc:
1522
# Can't find valid trailing context. Drop this patch.
1525
start_loc = self.match_main(text, text1, expected_loc)
1527
# No match found. :(
1528
results.append(False)
1529
# Subtract the delta for this failed patch from subsequent patches.
1530
delta -= patch.length2 - patch.length1
1533
results.append(True)
1534
delta = start_loc - expected_loc
1536
text2 = text[start_loc : start_loc + len(text1)]
1538
text2 = text[start_loc : end_loc + self.Match_MaxBits]
1540
# Perfect match, just shove the replacement text in.
1541
text = (text[:start_loc] + self.diff_text2(patch.diffs) +
1542
text[start_loc + len(text1):])
1545
# Run a diff to get a framework of equivalent indices.
1546
diffs = self.diff_main(text1, text2, False)
1547
if (len(text1) > self.Match_MaxBits and
1548
self.diff_levenshtein(diffs) / float(len(text1)) >
1549
self.Patch_DeleteThreshold):
1550
# The end points match, but the content is unacceptably bad.
1553
self.diff_cleanupSemanticLossless(diffs)
1555
for (op, data) in patch.diffs:
1556
if op != self.DIFF_EQUAL:
1557
index2 = self.diff_xIndex(diffs, index1)
1558
if op == self.DIFF_INSERT: # Insertion
1559
text = text[:start_loc + index2] + data + text[start_loc +
1561
elif op == self.DIFF_DELETE: # Deletion
1562
text = text[:start_loc + index2] + text[start_loc +
1563
self.diff_xIndex(diffs, index1 + len(data)):]
1564
if op != self.DIFF_DELETE:
1566
# Strip the padding off.
1567
text = text[len(nullPadding):-len(nullPadding)]
1568
return (text, results)
1570
def patch_addPadding(self, patches):
1571
"""Add some padding on text start and end so that edges can match
1572
something. Intended to be called only from within patch_apply.
1575
patches: Array of patch objects.
1578
The padding string added to each side.
1580
paddingLength = self.Patch_Margin
1582
for x in xrange(1, paddingLength + 1):
1583
nullPadding += chr(x)
1585
# Bump all the patches forward.
1586
for patch in patches:
1587
patch.start1 += paddingLength
1588
patch.start2 += paddingLength
1590
# Add some padding on start of first diff.
1593
if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1594
# Add nullPadding equality.
1595
diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1596
patch.start1 -= paddingLength # Should be 0.
1597
patch.start2 -= paddingLength # Should be 0.
1598
patch.length1 += paddingLength
1599
patch.length2 += paddingLength
1600
elif paddingLength > len(diffs[0][1]):
1601
# Grow first equality.
1602
extraLength = paddingLength - len(diffs[0][1])
1603
newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
1604
diffs[0] = (diffs[0][0], newText)
1605
patch.start1 -= extraLength
1606
patch.start2 -= extraLength
1607
patch.length1 += extraLength
1608
patch.length2 += extraLength
1610
# Add some padding on end of last diff.
1613
if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1614
# Add nullPadding equality.
1615
diffs.append((self.DIFF_EQUAL, nullPadding))
1616
patch.length1 += paddingLength
1617
patch.length2 += paddingLength
1618
elif paddingLength > len(diffs[-1][1]):
1619
# Grow last equality.
1620
extraLength = paddingLength - len(diffs[-1][1])
1621
newText = diffs[-1][1] + nullPadding[:extraLength]
1622
diffs[-1] = (diffs[-1][0], newText)
1623
patch.length1 += extraLength
1624
patch.length2 += extraLength
1628
def patch_splitMax(self, patches):
1629
"""Look through the patches and break up any which are longer than the
1630
maximum limit of the match algorithm.
1633
patches: Array of patch objects.
1635
if self.Match_MaxBits == 0:
1637
for x in xrange(len(patches)):
1638
if patches[x].length1 > self.Match_MaxBits:
1639
bigpatch = patches[x]
1640
# Remove the big old patch.
1643
patch_size = self.Match_MaxBits
1644
start1 = bigpatch.start1
1645
start2 = bigpatch.start2
1647
while len(bigpatch.diffs) != 0:
1648
# Create one of several smaller patches.
1651
patch.start1 = start1 - len(precontext)
1652
patch.start2 = start2 - len(precontext)
1654
patch.length1 = patch.length2 = len(precontext)
1655
patch.diffs.append((self.DIFF_EQUAL, precontext))
1657
while (len(bigpatch.diffs) != 0 and
1658
patch.length1 < patch_size - self.Patch_Margin):
1659
(diff_type, diff_text) = bigpatch.diffs[0]
1660
if diff_type == self.DIFF_INSERT:
1661
# Insertions are harmless.
1662
patch.length2 += len(diff_text)
1663
start2 += len(diff_text)
1664
patch.diffs.append(bigpatch.diffs.pop(0))
1666
elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
1667
patch.diffs[0][0] == self.DIFF_EQUAL and
1668
len(diff_text) > 2 * patch_size):
1669
# This is a large deletion. Let it pass in one chunk.
1670
patch.length1 += len(diff_text)
1671
start1 += len(diff_text)
1673
patch.diffs.append((diff_type, diff_text))
1674
del bigpatch.diffs[0]
1676
# Deletion or equality. Only take as much as we can stomach.
1677
diff_text = diff_text[:patch_size - patch.length1 -
1679
patch.length1 += len(diff_text)
1680
start1 += len(diff_text)
1681
if diff_type == self.DIFF_EQUAL:
1682
patch.length2 += len(diff_text)
1683
start2 += len(diff_text)
1687
patch.diffs.append((diff_type, diff_text))
1688
if diff_text == bigpatch.diffs[0][1]:
1689
del bigpatch.diffs[0]
1691
bigpatch.diffs[0] = (bigpatch.diffs[0][0],
1692
bigpatch.diffs[0][1][len(diff_text):])
1694
# Compute the head context for the next patch.
1695
precontext = self.diff_text2(patch.diffs)
1696
precontext = precontext[-self.Patch_Margin:]
1697
# Append the end context for this patch.
1698
postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
1700
patch.length1 += len(postcontext)
1701
patch.length2 += len(postcontext)
1702
if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1703
patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
1706
patch.diffs.append((self.DIFF_EQUAL, postcontext))
1710
patches.insert(x, patch)
1712
def patch_toText(self, patches):
1713
"""Take a list of patches and return a textual representation.
1716
patches: Array of patch objects.
1719
Text representation of patches.
1722
for patch in patches:
1723
text.append(str(patch))
1724
return "".join(text)
1726
def patch_fromText(self, textline):
1727
"""Parse a textual representation of patches and return a list of patch
1731
textline: Text representation of patches.
1734
Array of patch objects.
1737
ValueError: If invalid input.
1739
if type(textline) == unicode:
1740
# Patches should be composed of a subset of ascii chars, Unicode not
1741
# required. If this encode raises UnicodeEncodeError, patch is invalid.
1742
textline = textline.encode("ascii")
1746
text = textline.split('\n')
1747
while len(text) != 0:
1748
m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1750
raise ValueError, "Invalid patch string: " + text[0]
1752
patches.append(patch)
1753
patch.start1 = int(m.group(1))
1754
if m.group(2) == '':
1757
elif m.group(2) == '0':
1761
patch.length1 = int(m.group(2))
1763
patch.start2 = int(m.group(3))
1764
if m.group(4) == '':
1767
elif m.group(4) == '0':
1771
patch.length2 = int(m.group(4))
1775
while len(text) != 0:
1780
line = urllib.unquote(text[0][1:])
1781
line = line.decode("utf-8")
1784
patch.diffs.append((self.DIFF_INSERT, line))
1787
patch.diffs.append((self.DIFF_DELETE, line))
1790
patch.diffs.append((self.DIFF_EQUAL, line))
1792
# Start of next patch.
1795
# Blank line? Whatever.
1799
raise ValueError, "Invalid patch mode: '%s'\n%s" % (sign, line)
1526
self.patch_addContext(patch, prepatch_text)
1527
patches.append(patch)
1530
def patch_deepCopy(self, patches):
1531
"""Given an array of patches, return another array that is identical.
1534
patches: Array of patch objects.
1537
Array of patch objects.
1541
for patch in patches:
1542
patchCopy = patch_obj()
1543
# No need to deep copy the tuples since they are immutable.
1544
patchCopy.diffs = patch.diffs[:]
1545
patchCopy.start1 = patch.start1
1546
patchCopy.start2 = patch.start2
1547
patchCopy.length1 = patch.length1
1548
patchCopy.length2 = patch.length2
1549
patchesCopy.append(patchCopy)
1552
def patch_apply(self, patches, text):
1553
"""Merge a set of patches onto the text. Return a patched text, as
1554
well as a list of true/false values indicating which patches were
1558
patches: Array of patch objects.
1562
Two element Array, containing the new text and an array of boolean values.
1568
# Deep copy the patches so that no changes are made to originals.
1569
patches = self.patch_deepCopy(patches)
1571
nullPadding = self.patch_addPadding(patches)
1572
text = nullPadding + text + nullPadding
1573
self.patch_splitMax(patches)
1575
# delta keeps track of the offset between the expected and actual location
1576
# of the previous patch. If there are patches expected at positions 10 and
1577
# 20, but the first patch was found at 12, delta is 2 and the second patch
1578
# has an effective expected position of 22.
1581
for patch in patches:
1582
expected_loc = patch.start2 + delta
1583
text1 = self.diff_text1(patch.diffs)
1585
if len(text1) > self.Match_MaxBits:
1586
# patch_splitMax will only provide an oversized pattern in the case of
1588
start_loc = self.match_main(text, text1[:self.Match_MaxBits],
1591
end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
1592
expected_loc + len(text1) - self.Match_MaxBits)
1593
if end_loc == -1 or start_loc >= end_loc:
1594
# Can't find valid trailing context. Drop this patch.
1597
start_loc = self.match_main(text, text1, expected_loc)
1599
# No match found. :(
1600
results.append(False)
1601
# Subtract the delta for this failed patch from subsequent
1603
delta -= patch.length2 - patch.length1
1606
results.append(True)
1607
delta = start_loc - expected_loc
1609
text2 = text[start_loc: start_loc + len(text1)]
1611
text2 = text[start_loc: end_loc + self.Match_MaxBits]
1613
# Perfect match, just shove the replacement text in.
1614
text = (text[:start_loc] + self.diff_text2(patch.diffs) +
1615
text[start_loc + len(text1):])
1618
# Run a diff to get a framework of equivalent indices.
1619
diffs = self.diff_main(text1, text2, False)
1620
if (len(text1) > self.Match_MaxBits and
1621
self.diff_levenshtein(diffs) / float(len(text1)) >
1622
self.Patch_DeleteThreshold):
1623
# The end points match, but the content is unacceptably
1627
self.diff_cleanupSemanticLossless(diffs)
1629
for (op, data) in patch.diffs:
1630
if op != self.DIFF_EQUAL:
1631
index2 = self.diff_xIndex(diffs, index1)
1632
if op == self.DIFF_INSERT: # Insertion
1633
text = text[:start_loc + index2] + data + text[start_loc +
1635
elif op == self.DIFF_DELETE: # Deletion
1636
text = text[:start_loc + index2] + text[start_loc +
1637
self.diff_xIndex(diffs, index1 + len(data)):]
1638
if op != self.DIFF_DELETE:
1640
# Strip the padding off.
1641
text = text[len(nullPadding):-len(nullPadding)]
1642
return (text, results)
1644
def patch_addPadding(self, patches):
1645
"""Add some padding on text start and end so that edges can match
1646
something. Intended to be called only from within patch_apply.
1649
patches: Array of patch objects.
1652
The padding string added to each side.
1655
paddingLength = self.Patch_Margin
1657
for x in xrange(1, paddingLength + 1):
1658
nullPadding += chr(x)
1660
# Bump all the patches forward.
1661
for patch in patches:
1662
patch.start1 += paddingLength
1663
patch.start2 += paddingLength
1665
# Add some padding on start of first diff.
1668
if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1669
# Add nullPadding equality.
1670
diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1671
patch.start1 -= paddingLength # Should be 0.
1672
patch.start2 -= paddingLength # Should be 0.
1673
patch.length1 += paddingLength
1674
patch.length2 += paddingLength
1675
elif paddingLength > len(diffs[0][1]):
1676
# Grow first equality.
1677
extraLength = paddingLength - len(diffs[0][1])
1678
newText = nullPadding[len(diffs[0][1]):] + diffs[0][1]
1679
diffs[0] = (diffs[0][0], newText)
1680
patch.start1 -= extraLength
1681
patch.start2 -= extraLength
1682
patch.length1 += extraLength
1683
patch.length2 += extraLength
1685
# Add some padding on end of last diff.
1688
if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1689
# Add nullPadding equality.
1690
diffs.append((self.DIFF_EQUAL, nullPadding))
1691
patch.length1 += paddingLength
1692
patch.length2 += paddingLength
1693
elif paddingLength > len(diffs[-1][1]):
1694
# Grow last equality.
1695
extraLength = paddingLength - len(diffs[-1][1])
1696
newText = diffs[-1][1] + nullPadding[:extraLength]
1697
diffs[-1] = (diffs[-1][0], newText)
1698
patch.length1 += extraLength
1699
patch.length2 += extraLength
1703
def patch_splitMax(self, patches):
1704
"""Look through the patches and break up any which are longer than the
1705
maximum limit of the match algorithm.
1708
patches: Array of patch objects.
1711
if self.Match_MaxBits == 0:
1713
for x in xrange(len(patches)):
1714
if patches[x].length1 > self.Match_MaxBits:
1715
bigpatch = patches[x]
1716
# Remove the big old patch.
1719
patch_size = self.Match_MaxBits
1720
start1 = bigpatch.start1
1721
start2 = bigpatch.start2
1723
while len(bigpatch.diffs) != 0:
1724
# Create one of several smaller patches.
1727
patch.start1 = start1 - len(precontext)
1728
patch.start2 = start2 - len(precontext)
1730
patch.length1 = patch.length2 = len(precontext)
1731
patch.diffs.append((self.DIFF_EQUAL, precontext))
1733
while (len(bigpatch.diffs) != 0 and
1734
patch.length1 < patch_size - self.Patch_Margin):
1735
(diff_type, diff_text) = bigpatch.diffs[0]
1736
if diff_type == self.DIFF_INSERT:
1737
# Insertions are harmless.
1738
patch.length2 += len(diff_text)
1739
start2 += len(diff_text)
1740
patch.diffs.append(bigpatch.diffs.pop(0))
1742
elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
1743
patch.diffs[0][0] == self.DIFF_EQUAL and
1744
len(diff_text) > 2 * patch_size):
1745
# This is a large deletion. Let it pass in one
1747
patch.length1 += len(diff_text)
1748
start1 += len(diff_text)
1750
patch.diffs.append((diff_type, diff_text))
1751
del bigpatch.diffs[0]
1753
# Deletion or equality. Only take as much as we
1755
diff_text = diff_text[:patch_size - patch.length1 -
1757
patch.length1 += len(diff_text)
1758
start1 += len(diff_text)
1759
if diff_type == self.DIFF_EQUAL:
1760
patch.length2 += len(diff_text)
1761
start2 += len(diff_text)
1765
patch.diffs.append((diff_type, diff_text))
1766
if diff_text == bigpatch.diffs[0][1]:
1767
del bigpatch.diffs[0]
1769
bigpatch.diffs[0] = (bigpatch.diffs[0][0],
1770
bigpatch.diffs[0][1][len(diff_text):])
1772
# Compute the head context for the next patch.
1773
precontext = self.diff_text2(patch.diffs)
1774
precontext = precontext[-self.Patch_Margin:]
1775
# Append the end context for this patch.
1776
postcontext = self.diff_text1(bigpatch.diffs)[
1779
patch.length1 += len(postcontext)
1780
patch.length2 += len(postcontext)
1781
if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1782
patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
1785
patch.diffs.append((self.DIFF_EQUAL, postcontext))
1789
patches.insert(x, patch)
1791
def patch_toText(self, patches):
1792
"""Take a list of patches and return a textual representation.
1795
patches: Array of patch objects.
1798
Text representation of patches.
1802
for patch in patches:
1803
text.append(str(patch))
1804
return ''.join(text)
1806
def patch_fromText(self, textline):
1807
"""Parse a textual representation of patches and return a list of patch
1811
textline: Text representation of patches.
1814
Array of patch objects.
1817
ValueError: If invalid input.
1820
if type(textline) == unicode:
1821
# Patches should be composed of a subset of ascii chars, Unicode not
1822
# required. If this encode raises UnicodeEncodeError, patch is
1824
textline = textline.encode('ascii')
1828
text = textline.split('\n')
1829
while len(text) != 0:
1830
m = re.match('^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$', text[0])
1832
raise ValueError, 'Invalid patch string: ' + text[0]
1834
patches.append(patch)
1835
patch.start1 = int(m.group(1))
1836
if m.group(2) == '':
1839
elif m.group(2) == '0':
1843
patch.length1 = int(m.group(2))
1845
patch.start2 = int(m.group(3))
1846
if m.group(4) == '':
1849
elif m.group(4) == '0':
1853
patch.length2 = int(m.group(4))
1857
while len(text) != 0:
1862
line = urllib.unquote(text[0][1:])
1863
line = line.decode('utf-8')
1866
patch.diffs.append((self.DIFF_INSERT, line))
1869
patch.diffs.append((self.DIFF_DELETE, line))
1872
patch.diffs.append((self.DIFF_EQUAL, line))
1874
# Start of next patch.
1877
# Blank line? Whatever.
1881
raise ValueError, "Invalid patch mode: '%s'\n%s" % (sign, line)
1804
1886
class patch_obj:
  """Class representing one patch operation."""

  def __init__(self):
    """Initializes with an empty list of diffs."""
    # List of (op, text) diff tuples making up this patch.
    self.diffs = []
    # Start offsets (0-based) into text1 and text2; None until assigned.
    self.start1 = None
    self.start2 = None
    # Lengths of the affected regions in text1 and text2.
    self.length1 = 0
    self.length2 = 0

  def __str__(self):
    """Emulate GNU diff's format.
    Header: @@ -382,8 +481,9 @@
    Indices are printed as 1-based, not 0-based.

    Returns:
      The GNU diff string.
    """
    if self.length1 == 0:
      coords1 = str(self.start1) + ',0'
    elif self.length1 == 1:
      # Length of 1 is implied by an omitted length field.
      coords1 = str(self.start1 + 1)
    else:
      coords1 = str(self.start1 + 1) + ',' + str(self.length1)
    if self.length2 == 0:
      coords2 = str(self.start2) + ',0'
    elif self.length2 == 1:
      coords2 = str(self.start2 + 1)
    else:
      coords2 = str(self.start2 + 1) + ',' + str(self.length2)
    text = ['@@ -', coords1, ' +', coords2, ' @@\n']
    # Escape the body of the patch with %xx notation.
    for (op, data) in self.diffs:
      if op == diff_match_patch.DIFF_INSERT:
        text.append('+')
      elif op == diff_match_patch.DIFF_DELETE:
        text.append('-')
      elif op == diff_match_patch.DIFF_EQUAL:
        text.append(' ')
      # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
      data = data.encode('utf-8')
      text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + '\n')
    return ''.join(text)