~ubuntu-branches/ubuntu/utopic/bzr-fastimport/utopic-proposed : revision 1

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License as published by

5

# the Free Software Foundation; either version 2 of the License, or

6

# (at your option) any later version.

7

#

8

# This program is distributed in the hope that it will be useful,

9

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# GNU General Public License for more details.

12

#

13

# You should have received a copy of the GNU General Public License

14

# along with this program; if not, write to the Free Software

15

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

16

17

"""Import processor that supports all Bazaar repository formats."""

18

19

20

import re

21

import time

22

from bzrlib import (

23

builtins,

24

bzrdir,

25

debug,

26

delta,

27

errors,

28

generate_ids,

29

inventory,

30

lru_cache,

31

osutils,

32

progress,

33

revision,

34

revisiontree,

35

transport,

36

)

37

from bzrlib.repofmt import pack_repo

38

from bzrlib.trace import (

39

error,

40

mutter,

41

note,

42

warning,

43

)

44

import bzrlib.util.configobj.configobj as configobj

45

from bzrlib.plugins.fastimport import (

46

errors as plugin_errors,

47

helpers,

48

idmapfile,

49

processor,

50

revisionloader,

51

)

52

53

54

# How many commits before automatically reporting progress

55

_DEFAULT_AUTO_PROGRESS = 1000

56

57

# How many commits before automatically checkpointing

58

_DEFAULT_AUTO_CHECKPOINT = 10000

59

60

# How many inventories to cache

61

_DEFAULT_INV_CACHE_SIZE = 10

62

63

64

class GenericProcessor(processor.ImportProcessor):

65

"""An import processor that handles basic imports.

66

67

Current features supported:

68

69

* blobs are cached in memory

70

* files and symlinks commits are supported

71

* checkpoints automatically happen at a configurable frequency

72

over and above the stream requested checkpoints

73

* timestamped progress reporting, both automatic and stream requested

74

* LATER: reset support, tags for each branch

75

* some basic statistics are dumped on completion.

76

77

At checkpoints and on completion, the commit-id -> revision-id map is

78

saved to a file called 'fastimport-id-map'. If the import crashes

79

or is interrupted, it can be started again and this file will be

80

used to skip over already loaded revisions. The format of each line

81

is "commit-id revision-id" so commit-ids cannot include spaces.

82

83

Here are the supported parameters:

84

85

* info - name of a hints file holding the analysis generated

86

by running the fast-import-info processor in verbose mode. When

87

importing large repositories, this parameter is needed so

88

that the importer knows what blobs to intelligently cache.

89

90

* trees - update the working trees before completing.

91

By default, the importer updates the repository

92

and branches and the user needs to run 'bzr update' for the

93

branches of interest afterwards.

94

95

* checkpoint - automatically checkpoint every n commits over and

96

above any checkpoints contained in the import stream.

97

The default is 10000.

98

99

* count - only import this many commits then exit. If not set

100

or negative, all commits are imported.

101

102

* inv-cache - number of inventories to cache.

103

If not set, the default is 10.

104

105

* experimental - enable experimental mode, i.e. use features

106

not yet fully tested.

107

108

* import-marks - name of file to read to load mark information from

109

110

* export-marks - name of file to write to save mark information to

111

"""

112

113

known_params = [

114

'info',

115

'trees',

116

'checkpoint',

117

'count',

118

'inv-cache',

119

'experimental',

120

'import-marks',

121

'export-marks',

122

]

123

124

def note(self, msg, *args):

125

"""Output a note but timestamp it."""

126

msg = "%s %s" % (self._time_of_day(), msg)

127

note(msg, *args)

128

129

def warning(self, msg, *args):

130

"""Output a warning but timestamp it."""

131

msg = "%s WARNING: %s" % (self._time_of_day(), msg)

132

warning(msg, *args)

133

134

def debug(self, mgs, *args):

135

"""Output a debug message if the appropriate -D option was given."""

136

if "fast-import" in debug.debug_flags:

137

msg = "%s DEBUG: %s" % (self._time_of_day(), msg)

138

mutter(msg, *args)

139

140

def _time_of_day(self):

141

"""Time of day as a string."""

142

# Note: this is a separate method so tests can patch in a fixed value

143

return time.strftime("%H:%M:%S")

144

145

def _import_marks(self, filename):

146

try:

147

f = file(filename)

148

except IOError:

149

self.warning(

150

"Could not open import-marks file, not importing marks")

151

return

152

153

firstline = f.readline()

154

match = re.match(r'^format=(\d+)$', firstline)

155

if not match:

156

print >>sys.stderr, "%r doesn't look like a mark file" % \

157

(filename,)

158

sys.exit(1)

159

elif match.group(1) != '1':

160

print >>sys.stderr, 'format version in mark file not supported'

161

sys.exit(1)

162

163

for string in f.readline().rstrip('\n').split('\0'):

164

if not string:

165

continue

166

name, integer = string.rsplit('.', 1)

167

# We really can't do anything with the branch information, so we

168

# just skip it

169

170

self.cache_mgr.revision_ids = {}

171

for line in f:

172

line = line.rstrip('\n')

173

mark, revid = line.split(' ', 1)

174

self.cache_mgr.revision_ids[mark] = revid

175

f.close()

176

177

def export_marks(self, filename):

178

try:

179

f = file(filename, 'w')

180

except IOError:

181

self.warning(

182

"Could not open export-marks file, not exporting marks")

183

return

184

f.write('format=1\n')

185

f.write('\0tmp.0\n')

186

for mark, revid in self.cache_mgr.revision_ids.iteritems():

187

f.write('%s %s\n' % (mark, revid))

188

f.close()

189

190

def pre_process(self):

191

self._start_time = time.time()

192

self._load_info_and_params()

193

self.cache_mgr = GenericCacheManager(self.info, self.verbose,

194

self.inventory_cache_size)

195

196

if self.params.get("import-marks") is not None:

197

self._import_marks(self.params.get("import-marks"))

198

self.skip_total = False

199

self.first_incremental_commit = True

200

else:

201

self.first_incremental_commit = False

202

self.skip_total = self._init_id_map()

203

if self.skip_total:

204

self.note("Found %d commits already loaded - "

205

"skipping over these ...", self.skip_total)

206

self._revision_count = 0

207

208

# mapping of tag name to revision_id

209

self.tags = {}

210

211

# Create the revision loader needed for committing

212

new_repo_api = hasattr(self.repo, 'revisions')

213

if new_repo_api:

214

self.loader = revisionloader.RevisionLoader2(self.repo)

215

elif not self._experimental:

216

self.loader = revisionloader.RevisionLoader1(self.repo)

217

else:

218

def fulltext_when(count):

219

total = self.total_commits

220

if total is not None and count == total:

221

fulltext = True

222

else:

223

# Create an inventory fulltext every 200 revisions

224

fulltext = count % 200 == 0

225

if fulltext:

226

self.note("%d commits - storing inventory as full-text",

227

count)

228

return fulltext

229

230

self.loader = revisionloader.ImportRevisionLoader1(

231

self.repo, self.inventory_cache_size,

232

fulltext_when=fulltext_when)

233

234

# Disable autopacking if the repo format supports it.

235

# THIS IS A HACK - there is no sanctioned way of doing this yet.

236

if isinstance(self.repo, pack_repo.KnitPackRepository):

237

self._original_max_pack_count = \

238

self.repo._pack_collection._max_pack_count

239

def _max_pack_count_for_import(total_revisions):

240

return total_revisions + 1

241

self.repo._pack_collection._max_pack_count = \

242

_max_pack_count_for_import

243

else:

244

self._original_max_pack_count = None

245

246

# Create a write group. This is committed at the end of the import.

247

# Checkpointing closes the current one and starts a new one.

248

self.repo.start_write_group()

249

250

def _load_info_and_params(self):

251

self._experimental = bool(self.params.get('experimental', False))

252

253

# This is currently hard-coded but might be configurable via

254

# parameters one day if that's needed

255

repo_transport = self.repo.control_files._transport

256

self.id_map_path = repo_transport.local_abspath("fastimport-id-map")

257

258

# Load the info file, if any

259

info_path = self.params.get('info')

260

if info_path is not None:

261

self.info = configobj.ConfigObj(info_path)

262

else:

263

self.info = None

264

265

# Decide how often to automatically report progress

266

# (not a parameter yet)

267

self.progress_every = _DEFAULT_AUTO_PROGRESS

268

if self.verbose:

269

self.progress_every = self.progress_every / 10

270

271

# Decide how often to automatically checkpoint

272

self.checkpoint_every = int(self.params.get('checkpoint',

273

_DEFAULT_AUTO_CHECKPOINT))

274

275

# Decide how big to make the inventory cache

276

self.inventory_cache_size = int(self.params.get('inv-cache',

277

_DEFAULT_INV_CACHE_SIZE))

278

279

# Find the maximum number of commits to import (None means all)

280

# and prepare progress reporting. Just in case the info file

281

# has an outdated count of commits, we store the max counts

282

# at which we need to terminate separately to the total used

283

# for progress tracking.

284

try:

285

self.max_commits = int(self.params['count'])

286

if self.max_commits < 0:

287

self.max_commits = None

288

except KeyError:

289

self.max_commits = None

290

if self.info is not None:

291

self.total_commits = int(self.info['Command counts']['commit'])

292

if (self.max_commits is not None and

293

self.total_commits > self.max_commits):

294

self.total_commits = self.max_commits

295

else:

296

self.total_commits = self.max_commits

297

298

def _process(self, command_iter):

299

# if anything goes wrong, abort the write group if any

300

try:

301

processor.ImportProcessor._process(self, command_iter)

302

except:

303

if self.repo is not None and self.repo.is_in_write_group():

304

self.repo.abort_write_group()

305

raise

306

307

def post_process(self):

308

# Commit the current write group and checkpoint the id map

309

self.repo.commit_write_group()

310

self._save_id_map()

311

312

if self.params.get("export-marks") is not None:

313

self.export_marks(self.params.get("export-marks"))

314

315

# Update the branches

316

self.note("Updating branch information ...")

317

updater = GenericBranchUpdater(self.repo, self.branch, self.cache_mgr,

318

helpers.invert_dictset(self.cache_mgr.heads),

319

self.cache_mgr.last_ref, self.tags)

320

branches_updated, branches_lost = updater.update()

321

self._branch_count = len(branches_updated)

322

323

# Tell the user about branches that were not created

324

if branches_lost:

325

if not self.repo.is_shared():

326

self.warning("Cannot import multiple branches into "

327

"an unshared repository")

328

self.warning("Not creating branches for these head revisions:")

329

for lost_info in branches_lost:

330

head_revision = lost_info[1]

331

branch_name = lost_info[0]

332

self.note("\t %s = %s", head_revision, branch_name)

333

334

# Update the working trees as requested and dump stats

335

self._tree_count = 0

336

remind_about_update = True

337

if self._branch_count == 0:

338

self.note("no branches to update")

339

self.note("no working trees to update")

340

remind_about_update = False

341

elif self.params.get('trees', False):

342

trees = self._get_working_trees(branches_updated)

343

if trees:

344

self.note("Updating the working trees ...")

345

if self.verbose:

346

report = delta._ChangeReporter()

347

else:

348

reporter = None

349

for wt in trees:

350

wt.update(reporter)

351

self._tree_count += 1

352

remind_about_update = False

353

else:

354

self.warning("No working trees available to update")

355

self.dump_stats()

356

357

# Finish up by telling the user what to do next.

358

if self._original_max_pack_count:

359

# We earlier disabled autopacking, creating one pack every

360

# checkpoint instead. We now pack the repository to optimise

361

# how data is stored.

362

if self._revision_count > self.checkpoint_every:

363

self.note("Packing repository ...")

364

self.repo.pack()

365

# To be conservative, packing puts the old packs and

366

# indices in obsolete_packs. We err on the side of

367

# optimism and clear out that directory to save space.

368

self.note("Removing obsolete packs ...")

369

# TODO: Use a public API for this once one exists

370

repo_transport = self.repo._pack_collection.transport

371

repo_transport.clone('obsolete_packs').delete_multi(

372

repo_transport.list_dir('obsolete_packs'))

373

if remind_about_update:

374

# This message is explicitly not timestamped.

375

note("To refresh the working tree for a branch, "

376

"use 'bzr update'.")

377

378

def _get_working_trees(self, branches):

379

"""Get the working trees for branches in the repository."""

380

result = []

381

wt_expected = self.repo.make_working_trees()

382

for br in branches:

383

if br == self.branch and br is not None:

384

wt = self.working_tree

385

elif wt_expected:

386

try:

387

wt = br.bzrdir.open_workingtree()

388

except errors.NoWorkingTree:

389

self.warning("No working tree for branch %s", br)

390

continue

391

else:

392

continue

393

result.append(wt)

394

return result

395

396

def dump_stats(self):

397

time_required = progress.str_tdelta(time.time() - self._start_time)

398

rc = self._revision_count - self.skip_total

399

bc = self._branch_count

400

wtc = self._tree_count

401

self.note("Imported %d %s, updating %d %s and %d %s in %s",

402

rc, helpers.single_plural(rc, "revision", "revisions"),

403

bc, helpers.single_plural(bc, "branch", "branches"),

404

wtc, helpers.single_plural(wtc, "tree", "trees"),

405

time_required)

406

407

def _init_id_map(self):

408

"""Load the id-map and check it matches the repository.

409

410

:return: the number of entries in the map

411

"""

412

# Currently, we just check the size. In the future, we might

413

# decide to be more paranoid and check that the revision-ids

414

# are identical as well.

415

self.cache_mgr.revision_ids, known = idmapfile.load_id_map(

416

self.id_map_path)

417

existing_count = len(self.repo.all_revision_ids())

418

if existing_count < known:

419

raise plugin_errors.BadRepositorySize(known, existing_count)

420

return known

421

422

def _save_id_map(self):

423

"""Save the id-map."""

424

# Save the whole lot every time. If this proves a problem, we can

425

# change to 'append just the new ones' at a later time.

426

idmapfile.save_id_map(self.id_map_path, self.cache_mgr.revision_ids)

427

428

def blob_handler(self, cmd):

429

"""Process a BlobCommand."""

430

if cmd.mark is not None:

431

dataref = cmd.id

432

else:

433

dataref = osutils.sha_strings(cmd.data)

434

self.cache_mgr.store_blob(dataref, cmd.data)

435

436

def checkpoint_handler(self, cmd):

437

"""Process a CheckpointCommand."""

438

# Commit the current write group and start a new one

439

self.repo.commit_write_group()

440

self._save_id_map()

441

self.repo.start_write_group()

442

443

def commit_handler(self, cmd):

444

"""Process a CommitCommand."""

445

if self.skip_total and self._revision_count < self.skip_total:

446

_track_heads(cmd, self.cache_mgr)

447

# Check that we really do know about this commit-id

448

if not self.cache_mgr.revision_ids.has_key(cmd.id):

449

raise plugin_errors.BadRestart(cmd.id)

450

# Consume the file commands and free any non-sticky blobs

451

for fc in cmd.file_iter():

452

pass

453

self.cache_mgr._blobs = {}

454

self._revision_count += 1

455

# If we're finished getting back to where we were,

456

# load the file-ids cache

457

if self._revision_count == self.skip_total:

458

self._gen_file_ids_cache()

459

self.note("Generated the file-ids cache - %d entries",

460

len(self.cache_mgr.file_ids.keys()))

461

return

462

if self.first_incremental_commit:

463

self.first_incremental_commit = None

464

parents = _track_heads(cmd, self.cache_mgr)

465

self._gen_file_ids_cache(parents)

466

467

# 'Commit' the revision and report progress

468

handler = GenericCommitHandler(cmd, self.repo, self.cache_mgr,

469

self.loader, self.verbose, self._experimental)

470

handler.process()

471

self.cache_mgr.revision_ids[cmd.id] = handler.revision_id

472

self._revision_count += 1

473

self.report_progress("(%s)" % cmd.id)

474

475

# Check if we should finish up or automatically checkpoint

476

if (self.max_commits is not None and

477

self._revision_count >= self.max_commits):

478

self.note("Stopping after reaching requested count of commits")

479

self.finished = True

480

elif self._revision_count % self.checkpoint_every == 0:

481

self.note("%d commits - automatic checkpoint triggered",

482

self._revision_count)

483

self.checkpoint_handler(None)

484

485

def _gen_file_ids_cache(self, revs=False):

486

"""Generate the file-id cache by searching repository inventories.

487

"""

488

# Get the interesting revisions - the heads

489

if revs:

490

head_ids = revs

491

else:

492

head_ids = self.cache_mgr.heads.keys()

493

revision_ids = [self.cache_mgr.revision_ids[h] for h in head_ids]

494

495

# Update the fileid cache

496

file_ids = {}

497

for revision_id in revision_ids:

498

inv = self.repo.revision_tree(revision_id).inventory

499

# Cache the inventories while we're at it

500

self.cache_mgr.inventories[revision_id] = inv

501

for path, ie in inv.iter_entries():

502

file_ids[path] = ie.file_id

503

self.cache_mgr.file_ids = file_ids

504

505

def report_progress(self, details=''):

506

# TODO: use a progress bar with ETA enabled

507

if self._revision_count % self.progress_every == 0:

508

if self.total_commits is not None:

509

counts = "%d/%d" % (self._revision_count, self.total_commits)

510

eta = progress.get_eta(self._start_time, self._revision_count,

511

self.total_commits)

512

eta_str = progress.str_tdelta(eta)

513

if eta_str.endswith('--'):

514

eta_str = ''

515

else:

516

eta_str = '[%s] ' % eta_str

517

else:

518

counts = "%d" % (self._revision_count,)

519

eta_str = ''

520

self.note("%s commits processed %s%s" % (counts, eta_str, details))

521

522

def progress_handler(self, cmd):

523

"""Process a ProgressCommand."""

524

# We could use a progress bar here instead

525

self.note("progress %s" % (cmd.message,))

526

527

def reset_handler(self, cmd):

528

"""Process a ResetCommand."""

529

if cmd.ref.startswith('refs/tags/'):

530

tag_name = cmd.ref[len('refs/tags/'):]

531

if cmd.from_ is not None:

532

self._set_tag(tag_name, cmd.from_)

533

elif self.verbose:

534

self.warning("ignoring reset refs/tags/%s - no from clause"

535

% tag_name)

536

return

537

538

# FIXME: cmd.from_ is a committish and thus could reference

539

# another branch. Create a method for resolving commitish's.

540

if cmd.from_ is not None:

541

self.cache_mgr.track_heads_for_ref(cmd.ref, cmd.from_)

542

# Why is this required now vs at the end?

543

#updater = GenericBranchUpdater(self.repo, self.branch, self.cache_mgr,

544

# helpers.invert_dictset(self.cache_mgr.heads),

545

# self.cache_mgr.last_ref, self.tags)

546

#updater.update()

547

548

def tag_handler(self, cmd):

549

"""Process a TagCommand."""

550

if cmd.from_ is not None:

551

self._set_tag(cmd.id, cmd.from_)

552

else:

553

self.warning("ignoring tag %s - no from clause" % cmd.id)

554

555

def _set_tag(self, name, from_):

556

"""Define a tag given a name and import 'from' reference."""

557

bzr_tag_name = name.decode('utf-8', 'replace')

558

bzr_rev_id = self.cache_mgr.revision_ids[from_]

559

self.tags[bzr_tag_name] = bzr_rev_id

560

561

562

class GenericCacheManager(object):

563

"""A manager of caches for the GenericProcessor."""

564

565

def __init__(self, info, verbose=False, inventory_cache_size=10):

566

"""Create a manager of caches.

567

568

:param info: a ConfigObj holding the output from

569

the --info processor, or None if no hints are available

570

"""

571

self.verbose = verbose

572

573

# dataref -> data. datref is either :mark or the sha-1.

574

# Sticky blobs aren't removed after being referenced.

575

self._blobs = {}

576

self._sticky_blobs = {}

577

578

# revision-id -> Inventory cache

579

# these are large and we probably don't need too many as

580

# most parents are recent in history

581

self.inventories = lru_cache.LRUCache(inventory_cache_size)

582

583

# import commmit-ids -> revision-id lookup table

584

# we need to keep all of these but they are small

585

self.revision_ids = {}

586

587

# path -> file-ids - as generated

588

self.file_ids = {}

589

590

# Head tracking: last ref, last id per ref & map of commit ids to ref*s*

591

self.last_ref = None

592

self.last_ids = {}

593

self.heads = {}

594

595

# Work out the blobs to make sticky - None means all

596

self._blobs_to_keep = None

597

if info is not None:

598

try:

599

self._blobs_to_keep = info['Blob usage tracking']['multi']

600

except KeyError:

601

# info not in file - possible when no blobs used

602

pass

603

604

def store_blob(self, id, data):

605

"""Store a blob of data."""

606

if (self._blobs_to_keep is None or data == '' or

607

id in self._blobs_to_keep):

608

self._sticky_blobs[id] = data

609

else:

610

self._blobs[id] = data

611

612

def fetch_blob(self, id):

613

"""Fetch a blob of data."""

614

try:

615

return self._sticky_blobs[id]

616

except KeyError:

617

return self._blobs.pop(id)

618

619

def _delete_path(self, path):

620

"""Remove a path from caches."""

621

# we actually want to remember what file-id we gave a path,

622

# even when that file is deleted, so doing nothing is correct

623

pass

624

625

def _rename_path(self, old_path, new_path):

626

"""Rename a path in the caches."""

627

# In this case, we need to forget the file-id we gave a path,

628

# otherwise, we'll get duplicate file-ids in the repository.

629

self.file_ids[new_path] = self.file_ids[old_path]

630

del self.file_ids[old_path]

631

632

def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None):

633

if parents is not None:

634

for parent in parents:

635

refs = self.heads.get(parent)

636

if refs:

637

refs.discard(cmd_ref)

638

if not refs:

639

del self.heads[parent]

640

self.heads.setdefault(cmd_id, set()).add(cmd_ref)

641

self.last_ids[cmd_ref] = cmd_id

642

self.last_ref = cmd_ref

643

644

645

def _track_heads(cmd, cache_mgr):

646

"""Track the repository heads given a CommitCommand.

647

648

:return: the list of parents in terms of commit-ids

649

"""

650

# Get the true set of parents

651

if cmd.from_ is not None:

652

parents = [cmd.from_]

653

else:

654

last_id = cache_mgr.last_ids.get(cmd.ref)

655

if last_id is not None:

656

parents = [last_id]

657

else:

658

parents = []

659

parents.extend(cmd.merges)

660

661

# Track the heads

662

cache_mgr.track_heads_for_ref(cmd.ref, cmd.id, parents)

663

return parents

664

665

666

class GenericCommitHandler(processor.CommitHandler):

667

668

def __init__(self, command, repo, cache_mgr, loader, verbose=False,

669

_experimental=False):

670

processor.CommitHandler.__init__(self, command)

671

self.repo = repo

672

self.cache_mgr = cache_mgr

673

self.loader = loader

674

self.verbose = verbose

675

self._experimental = _experimental

676

677

def note(self, msg, *args):

678

"""Output a note but add context."""

679

msg = "%s (%s)" % (msg, self.command.id)

680

note(msg, *args)

681

682

def warning(self, msg, *args):

683

"""Output a warning but add context."""

684

msg = "WARNING: %s (%s)" % (msg, self.command.id)

685

warning(msg, *args)

686

687

def debug(self, msg, *args):

688

"""Output a mutter if the appropriate -D option was given."""

689

if "fast-import" in debug.debug_flags:

690

msg = "%s (%s)" % (msg, self.command.id)

691

mutter(msg, *args)

692

693

def pre_process_files(self):

694

"""Prepare for committing."""

695

self.revision_id = self.gen_revision_id()

696

# cache of texts for this commit, indexed by file-id

697

self.lines_for_commit = {}

698

if self.repo.supports_rich_root():

699

self.lines_for_commit[inventory.ROOT_ID] = []

700

701

# Track the heads and get the real parent list

702

parents = _track_heads(self.command, self.cache_mgr)

703

704

# Convert the parent commit-ids to bzr revision-ids

705

if parents:

706

self.parents = [self.cache_mgr.revision_ids[p]

707

for p in parents]

708

else:

709

self.parents = []

710

self.debug("%s id: %s, parents: %s", self.command.id,

711

self.revision_id, str(self.parents))

712

713

# Seed the inventory from the previous one

714

if len(self.parents) == 0:

715

self.inventory = self.gen_initial_inventory()

716

else:

717

# use the bzr_revision_id to lookup the inv cache

718

inv = self.get_inventory(self.parents[0])

719

# TODO: Shallow copy - deep inventory copying is expensive

720

self.inventory = inv.copy()

721

if self.repo.supports_rich_root():

722

self.inventory.revision_id = self.revision_id

723

else:

724

# In this repository, root entries have no knit or weave. When

725

# serializing out to disk and back in, root.revision is always

726

# the new revision_id.

727

self.inventory.root.revision = self.revision_id

728

729

# directory-path -> inventory-entry for current inventory

730

self.directory_entries = dict(self.inventory.directories())

731

732

def post_process_files(self):

733

"""Save the revision."""

734

self.cache_mgr.inventories[self.revision_id] = self.inventory

735

736

# Load the revision into the repository

737

rev_props = {}

738

committer = self.command.committer

739

who = "%s <%s>" % (committer[0],committer[1])

740

author = self.command.author

741

if author is not None:

742

author_id = "%s <%s>" % (author[0],author[1])

743

if author_id != who:

744

rev_props['author'] = author_id

745

rev = revision.Revision(

746

timestamp=committer[2],

747

timezone=committer[3],

748

committer=who,

749

message=self._escape_commit_message(self.command.message),

750

revision_id=self.revision_id,

751

properties=rev_props,

752

parent_ids=self.parents)

753

self.loader.load(rev, self.inventory, None,

754

lambda file_id: self._get_lines(file_id),

755

lambda revision_ids: self._get_inventories(revision_ids))

756

757

def _escape_commit_message(self, message):

758

"""Replace xml-incompatible control characters."""

759

# It's crap that we need to do this at this level (but we do)

760

# Code copied from bzrlib.commit.

761

762

# Python strings can include characters that can't be

763

# represented in well-formed XML; escape characters that

764

# aren't listed in the XML specification

765

# (http://www.w3.org/TR/REC-xml/#NT-Char).

766

message, _ = re.subn(

767

u'[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',

768

lambda match: match.group(0).encode('unicode_escape'),

769

message)

770

return message

771

772

def modify_handler(self, filecmd):

773

if filecmd.dataref is not None:

774

data = self.cache_mgr.fetch_blob(filecmd.dataref)

775

else:

776

data = filecmd.data

777

self.debug("modifying %s", filecmd.path)

778

self._modify_inventory(filecmd.path, filecmd.kind,

779

filecmd.is_executable, data)

780

781

def _delete_recursive(self, path):

782

self.debug("deleting %s", path)

783

fileid = self.bzr_file_id(path)

784

dirname, basename = osutils.split(path)

785

if (fileid in self.inventory and

786

isinstance(self.inventory[fileid], inventory.InventoryDirectory)):

787

for child_path in self.inventory[fileid].children.keys():

788

self._delete_recursive(os.utils.pathjoin(path, child_path))

789

try:

790

if self.inventory.id2path(fileid) == path:

791

del self.inventory[fileid]

792

else:

793

# already added by some other name?

794

if dirname in self.cache_mgr.file_ids:

795

parent_id = self.cache_mgr.file_ids[dirname]

796

del self.inventory[parent_id].children[basename]

797

except KeyError:

798

self._warn_unless_in_merges(fileid, path)

799

except errors.NoSuchId:

800

self._warn_unless_in_merges(fileid, path)

801

except AttributeError, ex:

802

if ex.args[0] == 'children':

803

# A directory has changed into a file and then one

804

# of it's children is being deleted!

805

self._warn_unless_in_merges(fileid, path)

806

else:

807

raise

808

try:

809

self.cache_mgr._delete_path(path)

810

except KeyError:

811

pass

812

813

def delete_handler(self, filecmd):

814

self._delete_recursive(filecmd.path)

815

816

def _warn_unless_in_merges(self, fileid, path):

817

if len(self.parents) <= 1:

818

return

819

for parent in self.parents[1:]:

820

if fileid in self.get_inventory(parent):

821

return

822

self.warning("ignoring delete of %s as not in parent inventories", path)

823

824

def copy_handler(self, filecmd):

825

raise NotImplementedError(self.copy_handler)

826

827

def rename_handler(self, filecmd):

828

old_path = filecmd.old_path

829

new_path = filecmd.new_path

830

self.debug("renaming %s to %s", old_path, new_path)

831

file_id = self.bzr_file_id(old_path)

832

basename, new_parent_ie = self._ensure_directory(new_path)

833

new_parent_id = new_parent_ie.file_id

834

existing_id = self.inventory.path2id(new_path)

835

if existing_id is not None:

836

self.inventory.remove_recursive_id(existing_id)

837

ie = self.inventory[file_id]

838

lines = self.loader._get_lines(file_id, ie.revision)

839

self.lines_for_commit[file_id] = lines

840

self.inventory.rename(file_id, new_parent_id, basename)

841

self.cache_mgr._rename_path(old_path, new_path)

842

self.inventory[file_id].revision = self.revision_id

843

844

def deleteall_handler(self, filecmd):

845

self.debug("deleting all files (and also all directories)")

846

# Would be nice to have an inventory.clear() method here

847

root_items = [ie for (name, ie) in

848

self.inventory.root.children.iteritems()]

849

for root_item in root_items:

850

self.inventory.remove_recursive_id(root_item.file_id)

851

852

def bzr_file_id_and_new(self, path):

853

"""Get a Bazaar file identifier and new flag for a path.

854

855

:return: file_id, is_new where

856

is_new = True if the file_id is newly created

857

"""

858

try:

859

id = self.cache_mgr.file_ids[path]

860

return id, False

861

except KeyError:

862

id = generate_ids.gen_file_id(path)

863

self.cache_mgr.file_ids[path] = id

864

self.debug("Generated new file id %s for '%s'", id, path)

865

return id, True

866

867

def bzr_file_id(self, path):

868

"""Get a Bazaar file identifier for a path."""

869

return self.bzr_file_id_and_new(path)[0]

870

871

def gen_initial_inventory(self):

872

"""Generate an inventory for a parentless revision."""

873

inv = inventory.Inventory(revision_id=self.revision_id)

874

if self.repo.supports_rich_root():

875

# The very first root needs to have the right revision

876

inv.root.revision = self.revision_id

877

return inv

878

879

def gen_revision_id(self):

880

"""Generate a revision id.

881

882

Subclasses may override this to produce deterministic ids say.

883

"""

884

committer = self.command.committer

885

# Perhaps 'who' being the person running the import is ok? If so,

886

# it might be a bit quicker and give slightly better compression?

887

who = "%s <%s>" % (committer[0],committer[1])

888

timestamp = committer[2]

889

return generate_ids.gen_revision_id(who, timestamp)

890

891

def get_inventory(self, revision_id):

892

"""Get the inventory for a revision id."""

893

try:

894

inv = self.cache_mgr.inventories[revision_id]

895

except KeyError:

896

if self.verbose:

897

self.note("get_inventory cache miss for %s", revision_id)

898

# Not cached so reconstruct from repository

899

inv = self.repo.revision_tree(revision_id).inventory

900

self.cache_mgr.inventories[revision_id] = inv

901

return inv

902

903

def _get_inventories(self, revision_ids):

904

"""Get the inventories for revision-ids.

905

906

This is a callback used by the RepositoryLoader to

907

speed up inventory reconstruction.

908

"""

909

present = []

910

inventories = []

911

# If an inventory is in the cache, we assume it was

912

# successfully loaded into the repsoitory

913

for revision_id in revision_ids:

914

try:

915

inv = self.cache_mgr.inventories[revision_id]

916

present.append(revision_id)

917

except KeyError:

918

if self.verbose:

919

self.note("get_inventories cache miss for %s", revision_id)

920

# Not cached so reconstruct from repository

921

if self.repo.has_revision(revision_id):

922

rev_tree = self.repo.revision_tree(revision_id)

923

present.append(revision_id)

924

else:

925

rev_tree = self.repo.revision_tree(None)

926

inv = rev_tree.inventory

927

self.cache_mgr.inventories[revision_id] = inv

928

inventories.append(inv)

929

return present, inventories

930

931

def _get_lines(self, file_id):

932

"""Get the lines for a file-id."""

933

return self.lines_for_commit[file_id]

934

935

def _modify_inventory(self, path, kind, is_executable, data):

936

"""Add to or change an item in the inventory."""

937

# Create the new InventoryEntry

938

basename, parent_ie = self._ensure_directory(path)

939

file_id = self.bzr_file_id(path)

940

ie = inventory.make_entry(kind, basename, parent_ie.file_id, file_id)

941

ie.revision = self.revision_id

942

if isinstance(ie, inventory.InventoryFile):

943

ie.executable = is_executable

944

lines = osutils.split_lines(data)

945

ie.text_sha1 = osutils.sha_strings(lines)

946

ie.text_size = sum(map(len, lines))

947

self.lines_for_commit[file_id] = lines

948

elif isinstance(ie, inventory.InventoryLink):

949

ie.symlink_target = data.encode('utf8')

950

# There are no lines stored for a symlink so

951

# make sure the cache used by get_lines knows that

952

self.lines_for_commit[file_id] = []

953

else:

954

raise errors.BzrError("Cannot import items of kind '%s' yet" %

955

(kind,))

956

957

# Record this new inventory entry

958

if file_id in self.inventory:

959

# HACK: no API for this (del+add does more than it needs to)

960

self.inventory._byid[file_id] = ie

961

parent_ie.children[basename] = ie

962

else:

963

self.inventory.add(ie)

964

965

def _ensure_directory(self, path):

966

"""Ensure that the containing directory exists for 'path'"""

967

dirname, basename = osutils.split(path)

968

if dirname == '':

969

# the root node doesn't get updated

970

return basename, self.inventory.root

971

try:

972

ie = self.directory_entries[dirname]

973

except KeyError:

974

# We will create this entry, since it doesn't exist

975

pass

976

else:

977

return basename, ie

978

979

# No directory existed, we will just create one, first, make sure

980

# the parent exists

981

dir_basename, parent_ie = self._ensure_directory(dirname)

982

dir_file_id = self.bzr_file_id(dirname)

983

ie = inventory.entry_factory['directory'](dir_file_id,

984

dir_basename,

985

parent_ie.file_id)

986

ie.revision = self.revision_id

987

self.directory_entries[dirname] = ie

988

# There are no lines stored for a directory so

989

# make sure the cache used by get_lines knows that

990

self.lines_for_commit[dir_file_id] = []

991

#print "adding dir for %s" % path

992

self.inventory.add(ie)

993

return basename, ie

994

995

996

class GenericBranchUpdater(object):

997

998

def __init__(self, repo, branch, cache_mgr, heads_by_ref, last_ref, tags):

999

"""Create an object responsible for updating branches.

1000

1001

:param heads_by_ref: a dictionary where

1002

names are git-style references like refs/heads/master;

1003

values are one item lists of commits marks.

1004

"""

1005

self.repo = repo

1006

self.branch = branch

1007

self.cache_mgr = cache_mgr

1008

self.heads_by_ref = heads_by_ref

1009

self.last_ref = last_ref

1010

self.tags = tags

1011

1012

def update(self):

1013

"""Update the Bazaar branches and tips matching the heads.

1014

1015

If the repository is shared, this routine creates branches

1016

as required. If it isn't, warnings are produced about the

1017

lost of information.

1018

1019

:return: updated, lost_heads where

1020

updated = the list of branches updated

1021

lost_heads = a list of (bazaar-name,revision) for branches that

1022

would have been created had the repository been shared

1023

"""

1024

updated = []

1025

branch_tips, lost_heads = self._get_matching_branches()

1026

for br, tip in branch_tips:

1027

if self._update_branch(br, tip):

1028

updated.append(br)

1029

return updated, lost_heads

1030

1031

def _get_matching_branches(self):

1032

"""Get the Bazaar branches.

1033

1034

:return: default_tip, branch_tips, lost_heads where

1035

default_tip = the last commit mark for the default branch

1036

branch_tips = a list of (branch,tip) tuples for other branches.

1037

lost_heads = a list of (bazaar-name,revision) for branches that

1038

would have been created had the repository been shared and

1039

everything succeeded

1040

"""

1041

branch_tips = []

1042

lost_heads = []

1043

ref_names = self.heads_by_ref.keys()

1044

if self.branch is not None:

1045

trunk = self.select_trunk(ref_names)

1046

default_tip = self.heads_by_ref[trunk][0]

1047

branch_tips.append((self.branch, default_tip))

1048

ref_names.remove(trunk)

1049

1050

# Convert the reference names into Bazaar speak

1051

bzr_names = self._get_bzr_names_from_ref_names(ref_names)

1052

1053

# Policy for locating branches

1054

def dir_under_current(name, ref_name):

1055

# Using the Bazaar name, get a directory under the current one

1056

return name

1057

def dir_sister_branch(name, ref_name):

1058

# Using the Bazaar name, get a sister directory to the branch

1059

return osutils.pathjoin(self.branch.base, "..", name)

1060

if self.branch is not None:

1061

dir_policy = dir_sister_branch

1062

else:

1063

dir_policy = dir_under_current

1064

1065

# Create/track missing branches

1066

shared_repo = self.repo.is_shared()

1067

for name in sorted(bzr_names.keys()):

1068

ref_name = bzr_names[name]

1069

tip = self.heads_by_ref[ref_name][0]

1070

if shared_repo:

1071

location = dir_policy(name, ref_name)

1072

try:

1073

br = self.make_branch(location)

1074

branch_tips.append((br,tip))

1075

continue

1076

except errors.BzrError, ex:

1077

error("ERROR: failed to create branch %s: %s",

1078

location, ex)

1079

lost_head = self.cache_mgr.revision_ids[tip]

1080

lost_info = (name, lost_head)

1081

lost_heads.append(lost_info)

1082

return branch_tips, lost_heads

1083

1084

def select_trunk(self, ref_names):

1085

"""Given a set of ref names, choose one as the trunk."""

1086

for candidate in ['refs/heads/master']:

1087

if candidate in ref_names:

1088

return candidate

1089

# Use the last reference in the import stream

1090

return self.last_ref

1091

1092

def make_branch(self, location):

1093

"""Make a branch in the repository if not already there."""

1094

try:

1095

return bzrdir.BzrDir.open(location).open_branch()

1096

except errors.NotBranchError, ex:

1097

return bzrdir.BzrDir.create_branch_convenience(location)

1098

1099

def _get_bzr_names_from_ref_names(self, ref_names):

1100

"""Generate Bazaar branch names from import ref names.

1101

1102

:return: a dictionary with Bazaar names as keys and

1103

the original reference names as values.

1104

"""

1105

bazaar_names = {}

1106

for ref_name in sorted(ref_names):

1107

parts = ref_name.split('/')

1108

if parts[0] == 'refs':

1109

parts.pop(0)

1110

full_name = "--".join(parts)

1111

bazaar_name = parts[-1]

1112

if bazaar_name in bazaar_names:

1113

if parts[0] == 'remotes':

1114

bazaar_name += ".remote"

1115

else:

1116

bazaar_name = full_name

1117

bazaar_names[bazaar_name] = ref_name

1118

return bazaar_names

1119

1120

def _update_branch(self, br, last_mark):

1121

"""Update a branch with last revision and tag information.

1122

1123

:return: whether the branch was changed or not

1124

"""

1125

last_rev_id = self.cache_mgr.revision_ids[last_mark]

1126

revs = list(self.repo.iter_reverse_revision_history(last_rev_id))

1127

revno = len(revs)

1128

existing_revno, existing_last_rev_id = br.last_revision_info()

1129

changed = False

1130

if revno != existing_revno or last_rev_id != existing_last_rev_id:

1131

br.set_last_revision_info(revno, last_rev_id)

1132

changed = True

1133

# apply tags known in this branch

1134

my_tags = {}

1135

if self.tags:

1136

for tag,rev in self.tags.items():

1137

if rev in revs:

1138

my_tags[tag] = rev

1139

if my_tags:

1140

br.tags._set_tag_dict(my_tags)

1141

changed = True

1142

if changed:

1143

tagno = len(my_tags)

1144

note("\t branch %s now has %d %s and %d %s", br.nick,

1145

revno, helpers.single_plural(revno, "revision", "revisions"),

1146

tagno, helpers.single_plural(tagno, "tag", "tags"))

1147

return changed