201
209
self._lock.release()
212
# Process-wide cache mapping (branch_tip, revno_str) <-> revision ids.
# lru_cache.LRUCache is not thread-safe (even get() mutates its LRU
# bookkeeping), so every access must hold _revno_revid_lock — see
# RevnoRevidMemoryCache, which wraps this pair.
_raw_revno_revid_cache = lru_cache.LRUCache(10000)
_revno_revid_lock = threading.RLock()
216
class RevnoRevidMemoryCache(object):
    """A store that maps revnos to revids based on the branch it is in.

    Entries are stored in both directions (revid -> revno_str and
    revno_str -> revid), keyed off the branch tip, so entries cached
    under an old tip are simply never hit again.
    """

    def __init__(self, cache, lock, branch_tip):
        """Wrap a shared cache for one branch tip.

        :param cache: dict-like shared store (e.g. lru_cache.LRUCache).
        :param lock: lock guarding every access to `cache`.
        :param branch_tip: tip revision id, mixed into every cache key.
        """
        # Note: what we'd really like is something that knew how long it takes
        # to produce a revno * how often it is accessed.  Since some revnos
        # take 100x longer to produce than others.  Could we cheat and just loop
        # (NOTE(review): the rest of this sentence was lost to corruption.)
        # There are also other possible layouts.  A per-branch cache, with an
        # LRU around the whole thing, etc.  I chose this for simplicity.
        self._branch_tip = branch_tip
        self._cache = cache
        # lru_cache is not thread-safe, so we need to lock all accesses.
        # It is even modified when doing a get() on it.
        self._lock = lock

    def get(self, key):
        """Return the data associated with `key`.

        Otherwise return None.

        :param key: Can be a revno_str or a revid.
        """
        # Even a read mutates the LRU bookkeeping, hence the lock.
        with self._lock:
            return self._cache.get((self._branch_tip, key))

    def set(self, revid, revno_str):
        """Store the bidirectional revid <-> revno_str mapping."""
        with self._lock:
            # TODO: StaticTuples ?  Probably only useful if we cache more than
            #       10k of them. 100k/1M is probably useful.
            self._cache[(self._branch_tip, revid)] = revno_str
            self._cache[(self._branch_tip, revno_str)] = revid
203
258
# Used to store locks that prevent multiple threads from building a
# revision graph for the same branch at the same time, because that can
# cause severe performance issues that are so bad that the system seems
# to hang.  (NOTE(review): the tail of this sentence was lost to
# corruption; reconstructed.)
revision_graph_locks = {}
revision_graph_check_lock = threading.Lock()

# sqlite is single-writer, so serialize history-db imports; see
# History.__init__, which hands this lock to the Querier.
history_db_importer_lock = threading.Lock()
210
266
class History(object):
    """Decorate a branch to provide information for rendering.

    :ivar _file_change_cache: An object that caches information about the
        files that changed between two revisions.
    :ivar _rev_info: A list of information about revisions.  This is by far
        the most cryptic data structure in loggerhead.  At the top level, it
        is a list of 3-tuples [(merge-info, where-merged, parents)].
        `merge-info` is (seq, revid, merge_depth, revno_str, end_of_merge) --
        like a merged sorted list, but the revno is stringified.
        `where-merged` is a tuple of revisions that have this revision as a
        non-lefthand parent.  Finally, `parents` is just the usual list of
        parents of this revision.
    :ivar _rev_indices: A dictionary mapping each revision id to the index of
        the information about it in _rev_info.
    :ivar _querier: A HistoryDB.Querier instance, allowing us to query for
        information in the ancestry of the branch.
    :ivar _revno_revid: A dictionary mapping stringified revnos to revision
        ids.

    NOTE(review): this docstring merges attribute documentation from two
    interleaved revisions of the file (the whole-history attributes and
    the history-db querier); confirm which set the final code keeps.
    """
234
def _load_whole_history_data(self, caches, cache_key):
    """Set the attributes relating to the whole history of the branch.

    :param caches: a list of caches with interfaces like
        `RevInfoMemoryCache` and be ordered from fastest to slowest.
    :param cache_key: the key to use with the caches.
    """
    self._rev_indices = None
    self._rev_info = None

    missed_caches = []
    def update_missed_caches():
        # Backfill every faster cache that missed with the data we found.
        for cache in missed_caches:
            cache.set(cache_key, self.last_revid, self._rev_info)

    # Theoretically, it's possible for two threads to race in creating
    # the Lock() object for their branch, so we put a lock around
    # creating the per-branch Lock().
    revision_graph_check_lock.acquire()
    try:
        if cache_key not in revision_graph_locks:
            revision_graph_locks[cache_key] = threading.Lock()
    finally:
        revision_graph_check_lock.release()

    revision_graph_locks[cache_key].acquire()
    try:
        for cache in caches:
            data = cache.get(cache_key, self.last_revid)
            if data is not None:
                self._rev_info = data
                update_missed_caches()
                break
            else:
                missed_caches.append(cache)
        else:
            # No cache had it: compute the expensive whole-history data
            # from scratch, then backfill every cache that missed.
            whole_history_data = compute_whole_history_data(self._branch)
            self._rev_info, self._rev_indices = whole_history_data
            update_missed_caches()
    finally:
        revision_graph_locks[cache_key].release()

    if self._rev_indices is not None:
        self._revno_revid = {}
        for ((_, revid, _, revno_str, _), _, _) in self._rev_info:
            self._revno_revid[revno_str] = revid
    else:
        # Cached data carries only _rev_info; rebuild both indices.
        self._revno_revid = {}
        self._rev_indices = {}
        for ((seq, revid, _, revno_str, _), _, _) in self._rev_info:
            self._rev_indices[revid] = seq
            self._revno_revid[revno_str] = revid
287
282
def __init__(self, branch, whole_history_data_cache, file_cache=None,
             revinfo_disk_cache=None, cache_key=None,
             show_merge_points=True,
             cache_path=None):
    """Decorate `branch` (which must be read-locked) for rendering.

    NOTE(review): reconstructed from a corrupted interleave of two file
    revisions; the history-db variant of the constructor is kept.
    Verify the exact statement order against upstream loggerhead.
    """
    assert branch.is_locked(), (
        "Can only construct a History object with a read-locked branch.")
    if file_cache is not None:
        self._file_change_cache = file_cache
    else:
        self._file_change_cache = None
    self._branch = branch
    self._branch_tags = None
    self._inventory_cache = {}
    self._show_merge_points = show_merge_points
    # Map from (tip_revision, revision_id) => revno_str
    # and from (tip_revision, revno_str) => revision_id
    self._querier = _get_querier(branch)
    if self._querier is None:
        # History-db is not configured for this branch, do it ourselves
        assert cache_path is not None
        self._querier = history_db.Querier(
            os.path.join(cache_path, 'historydb.sql'), branch)
    # sqlite is single-writer, so block concurrent updates.
    # Note that this was even done in the past because of perf issues, even
    # without a disk requirement.
    self._querier.set_importer_lock(history_db_importer_lock)
    # TODO: Is this being premature? It makes the rest of the code
    #       simpler, since the branch tip is then always importable.
    self._querier.ensure_branch_tip()
    self._branch_nick = self._branch.get_config().get_nickname()
    self.log = logging.getLogger('loggerhead.%s' % (self._branch_nick,))
    self.last_revid = branch.last_revision()
    self._revno_revid_cache = RevnoRevidMemoryCache(
        _raw_revno_revid_cache, _revno_revid_lock,
        self._branch.last_revision())
310
319
def has_revisions(self):
    """Return True when the branch has any history at all.

    NOTE(review): body lost to corruption; reconstructed as the
    conventional null-revision check -- confirm against upstream.
    """
    return not bzrlib.revision.is_null(self.last_revid)

def get_config(self):
    """Return the config object of the decorated branch."""
    return self._branch.get_config()
316
325
def get_revno(self, revid):
    """Return the dotted revno string ('1.2.3') for `revid`.

    Revisions the querier cannot resolve (ghosts) yield 'unknown',
    matching get_revnos().
    """
    revno_str = self._revno_revid_cache.get(revid)
    if revno_str is not None:
        return revno_str
    revnos = self._querier.get_dotted_revno_range_multi([revid])
    try:
        dotted_revno = revnos[revid]
    except KeyError:
        # Resolves the original "Should probably handle KeyError?" TODO:
        # report ghosts as 'unknown', consistent with get_revnos().
        return 'unknown'
    revno_str = '.'.join(map(str, dotted_revno))
    self._revno_revid_cache.set(revid, revno_str)
    return revno_str
338
def get_revnos(self, revids):
    """Get a map of revid => revno for all revisions.

    Revisions that cannot be resolved (ghosts) map to 'unknown'.
    """
    revno_map = {}
    unknown = []
    for revid in revids:
        if revid is None:
            # NOTE(review): guard reconstructed from corrupted text; a
            # None entry cannot be looked up, so report it as 'unknown'.
            revno_map[revid] = 'unknown'
            continue
        revno_str = self._revno_revid_cache.get(revid)
        if revno_str is not None:
            revno_map[revid] = revno_str
            continue
        unknown.append(revid)
    if not unknown:
        return revno_map
    # querier returns dotted revno tuples
    query_revno_map = self._querier.get_dotted_revno_range_multi(
        unknown)
    ghosts = set(unknown)
    # .items() rather than py2-only .iteritems(): identical iteration
    # behavior on py2, and portable.
    for revid, dotted_revno in query_revno_map.items():
        revno_str = '.'.join(map(str, dotted_revno))
        self._revno_revid_cache.set(revid, revno_str)
        revno_map[revid] = revno_str
        ghosts.discard(revid)
    # Anything the querier could not resolve is reported as 'unknown'.
    revno_map.update([(n, 'unknown') for n in ghosts])
    return revno_map
366
def get_revid_for_revno(self, revno_str):
    """Return the revision id for a dotted revno string like '1.2.3'."""
    revid = self._revno_revid_cache.get(revno_str)
    if revid is not None:
        return revid
    dotted_revno = tuple(map(int, revno_str.split('.')))
    revnos = self._querier.get_revision_ids([dotted_revno])
    revnos = dict([('.'.join(map(str, drn)), ri)
                   for drn, ri in revnos.items()])
    # BUGFIX(review): the original loop reused the names `revno_str` and
    # `revid`, clobbering the revno we were asked about before the final
    # `revnos[revno_str]` lookup.  Distinct loop names keep the request
    # intact.  (.items() instead of py2-only .iteritems() is behavior-
    # identical on py2 and portable.)
    for found_revno_str, found_revid in revnos.items():
        self._revno_revid_cache.set(found_revid, found_revno_str)
    return revnos[revno_str]
378
def _get_lh_parent(self, revid):
    """Get the left-hand parent of a given revision id."""
    # TODO: Move this into a public method on Querier
    # TODO: Possibly look into caching some of this info in memory, and
    #       between HTTP requests.
    self._querier.ensure_branch_tip()
    return self._querier._get_lh_parent_rev_id(revid)
386
def _get_children(self, revid):
    """Get the children of the given revision id."""
    # XXX: We should be filtering this based on self._branch's ancestry...
    # TODO: We also should be using a method on Querier, instead of doing
    #       the query ourselves.
    c = self._querier._get_cursor()
    res = c.execute("SELECT c.revision_id"
                    " FROM revision p, parent, revision c"
                    " WHERE child = c.db_id"
                    " AND parent = p.db_id"
                    " AND p.revision_id = ?",
                    (revid,))
    return [r[0] for r in res]
324
400
def get_revids_from(self, revid_list, start_revid):
    """Yield the mainline (wrt start_revid) revisions that merged each
    revid in revid_list.
    """
    tip_revid = start_revid
    if revid_list is None:
        # This returns the mainline of start_revid
        # TODO: We could use Querier for this
        # Note: Don't use self._branch.revision_history, as that always
        #       grabs the full history, and we now support stopping early.
        history = self._branch.repository.iter_reverse_revision_history(
            start_revid)
        for revid in history:
            yield revid
        return
    revid_set = set(revid_list)

    while tip_revid is not None and revid_set:
        parent_revid = self._get_lh_parent(tip_revid)
        # TODO: Consider caching this, especially between HTTP requests
        introduced = self._querier.iter_merge_sorted_revisions(
            start_revision_id=tip_revid, stop_revision_id=parent_revid)
        introduced_revs = set([i[0] for i in introduced
                               if i[0] in revid_set])
        if introduced_revs:
            # tip_revid is the mainline revision that merged (at least)
            # one of the requested revisions.
            revid_set.difference_update(introduced_revs)
            yield tip_revid
        tip_revid = parent_revid
353
430
def get_short_revision_history_by_fileid(self, file_id):
354
431
# FIXME: would be awesome if we could get, for a folder, the list of
366
443
del possible_keys, next_keys
369
def get_revision_history_since(self, revid_list, date):
    """Return the subset of revid_list that is not older than `date`.

    NOTE(review): the reverse/bisect bookkeeping between the visible
    lines was lost to corruption and reconstructed -- confirm against
    upstream loggerhead.
    """
    # if a user asks for revisions starting at 01-sep, they mean inclusive,
    # so start at midnight on 02-sep.
    date = date + datetime.timedelta(days=1)
    # our revid list is sorted in REVERSE date order,
    # so go thru some hoops here...
    revid_list.reverse()
    index = bisect.bisect(_RevListToTimestamps(revid_list,
                                               self._branch.repository),
                          date)
    if index == 0:
        return []
    revid_list.reverse()
    # Convert the position in the (ascending) list into a tail slice of
    # the original (descending) list.
    index = -index
    return revid_list[index:]
385
def get_search_revid_list(self, query, revid_list):
    """
    given a "quick-search" query, try a few obvious possible meanings:

        - revision id or # ("128.1.3")
        - date (US style "mm/dd/yy", earth style "dd-mm-yy", or \
iso style "yyyy-mm-dd")
        - comment text as a fallback

    and return a revid list that matches.

    NOTE(review): reconstructed from corrupted text; the comment-text
    fallback mentioned in the docstring is not visible in this chunk.
    """
    # FIXME: there is some silliness in this action.  we have to look up
    # all the relevant changes (time-consuming) only to return a list of
    # revids which will be used to fetch a set of changes again.

    # if they entered a revid, just jump straight there;
    # ignore the passed-in revid_list
    revid = self.fix_revid(query)
    if revid is not None:
        if isinstance(revid, unicode):
            revid = revid.encode('utf-8')
        changes = self.get_changes([revid])
        if (changes is not None) and (len(changes) > 0):
            return [revid]

    date = None
    m = self.us_date_re.match(query)
    if m is not None:
        date = datetime.datetime(util.fix_year(int(m.group(3))),
                                 int(m.group(1)),
                                 int(m.group(2)))
    else:
        m = self.earth_date_re.match(query)
        if m is not None:
            date = datetime.datetime(util.fix_year(int(m.group(3))),
                                     int(m.group(2)),
                                     int(m.group(1)))
        else:
            m = self.iso_date_re.match(query)
            if m is not None:
                date = datetime.datetime(util.fix_year(int(m.group(1))),
                                         int(m.group(2)),
                                         int(m.group(3)))
    if date is not None:
        if revid_list is None:
            # if no limit to the query was given,
            # search only the direct-parent path.
            revid_list = list(self.get_revids_from(None, self.last_revid))
        return self.get_revision_history_since(revid_list, date)
435
446
revno_re = re.compile(r'^[\d\.]+$')
# the date regex are without a final '$' so that queries like
# "2006-11-30 12:15" still mostly work.  (i think it's better to give
# them 90% of what they want instead of nothing at all.)
# BUGFIX(review): the year group read (\d\d(\d\d?)), which *requires* a
# third digit and so rejected two-digit years -- even though every caller
# runs the match through util.fix_year(), whose whole purpose is fixing
# two-digit years.  The second digit pair is now optional: (\d\d(\d\d)?).
us_date_re = re.compile(r'^(\d{1,2})/(\d{1,2})/(\d\d(\d\d)?)')
earth_date_re = re.compile(r'^(\d{1,2})-(\d{1,2})-(\d\d(\d\d)?)')
iso_date_re = re.compile(r'^(\d\d\d\d)-(\d\d)-(\d\d)')
443
448
def fix_revid(self, revid):
444
449
# if a "revid" is actually a dotted revno, convert it to a revid
464
474
if revid is None:
465
475
revid = self.last_revid
466
476
if file_id is not None:
467
# since revid is 'start_revid', possibly should start the path
468
# tracing from revid... FIXME
469
revlist = list(self.get_short_revision_history_by_fileid(file_id))
470
revlist = list(self.get_revids_from(revlist, revid))
478
self.get_short_revision_history_by_fileid(file_id, revid))
479
revlist = self.get_revids_from(revlist, revid)
472
revlist = list(self.get_revids_from(None, revid))
481
revlist = self.get_revids_from(None, revid)
475
def get_view(self, revid, start_revid, file_id, query=None):
484
def _iterate_sufficiently(self, iterable, stop_at, extra_rev_count):
    """Return a list of iterable.

    If extra_rev_count is None, fully consume iterable.
    Otherwise, stop at 'stop_at' + extra_rev_count.

    Example:
      _iterate_sufficiently([rev1, rev2, rev3, rev4], rev2, 10)
      iterate until you find stop_at, then iterate 10 more times.

    NOTE(review): the middle of this function was lost to corruption and
    reconstructed; confirm the stop_at comparison and the extra-count
    boundary against upstream loggerhead.
    """
    if extra_rev_count is None:
        return list(iterable)
    result = []
    found = False
    for item in iterable:
        result.append(item)
        if item == stop_at:
            found = True
            break
    if found:
        # Take exactly extra_rev_count further items, if available.
        for count, item in enumerate(iterable):
            if count >= extra_rev_count:
                break
            result.append(item)
    return result
509
def get_view(self, revid, start_revid, file_id, query=None,
510
extra_rev_count=None):
477
512
use the URL parameters (revid, start_revid, file_id, and query) to
478
513
determine the revision list we're viewing (start_revid, file_id, query)
625
670
if len(changes) == 0:
628
# some data needs to be recalculated each time, because it may
629
# change as new revisions are added.
673
needed_revnos = set()
630
674
for change in changes:
675
needed_revnos.add(change.revid)
676
needed_revnos.update([p_id for p_id in change.parents])
677
revno_map = self.get_revnos(needed_revnos)
679
def merge_points_callback(a_change, attr):
631
680
merge_revids = self.simplify_merge_point_list(
632
self.get_merge_point_list(change.revid))
633
change.merge_points = [
634
util.Container(revid=r,
635
revno=self.get_revno(r)) for r in merge_revids]
681
self.get_merge_point_list(a_change.revid))
684
revno_map = self.get_revnos(merge_revids)
685
return [util.Container(revid=r, revno=revno_map[r])
686
for r in merge_revids]
689
for change in changes:
690
change._set_property('merge_points', merge_points_callback)
636
691
if len(change.parents) > 0:
637
change.parents = [util.Container(revid=r,
638
revno=self.get_revno(r)) for r in change.parents]
639
change.revno = self.get_revno(change.revid)
642
for change in changes:
692
change.parents = [util.Container(revid=r, revno=revno_map[r])
693
for r in change.parents]
694
change.revno = revno_map[change.revid]
643
695
change.parity = parity