~ubuntu-branches/debian/jessie/bzr-fastimport/jessie

« back to all changes in this revision

Viewing changes to parser.py

Committer: Bazaar Package Importer
Author(s): Jelmer Vernooij
Date: 2010-11-06 18:40:27 UTC
mfrom: (1.1.6 upstream)
Revision ID: james.westby@ubuntu.com-20101106184027-iclo8iim9equ6i8b

Tags: 0.9.0+bzr279-1

* New upstream snapshot.
* Bump standards version to 3.9.1 (no changes).
* Run testsuite during package build.

files added:
.pc

.pc/.version

.pc/applied-patches

.pc/debian-changes-0.9.0+bzr279-1

.pc/debian-changes-0.9.0+bzr279-1/tests

.pc/debian-changes-0.9.0+bzr279-1/tests/test_branch_mapper.py

.pc/debian-changes-0.9.0+bzr279-1/tests/test_filter_processor.py

.pc/debian-changes-0.9.0+bzr279-1/tests/test_head_tracking.py

.pc/debian-changes-0.9.0+bzr279-1/tests/test_helpers.py

debian/patches

debian/patches/debian-changes-0.9.0+bzr279-1

debian/patches/series

exporters/darcs/t/testimport-gitsymlink.sh

files removed:
commands.py

dates.py

errors.py

idmapfile.py

parser.py

processor.py

processors/filter_processor.py

processors/info_processor.py

processors/query_processor.py

tests/test_commands.py

tests/test_errors.py

tests/test_parser.py

files modified:
NEWS

README.txt

__init__.py

branch_updater.py

bzr_commit_handler.py

bzr_exporter.py

cache_manager.py

debian/changelog

debian/control

debian/rules

exporters/darcs/TODO

exporters/darcs/darcs-fast-export

exporters/darcs/darcs-fast-import

exporters/darcs/darcs-fast-import.txt

exporters/darcs/git-darcs

exporters/darcs/git-darcs.txt

exporters/darcs/t/lib.sh

exporters/darcs/x2d

exporters/darcs/x2d.txt

helpers.py

processors/generic_processor.py

revision_store.py

setup.py

tests/__init__.py

tests/test_generic_processor.py

tests/test_revision_store.py

Show diffs side-by-side

added added

removed removed

parser.py

# This program is free software; you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation; either version 2 of the License, or

# (at your option) any later version.

# This program is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with this program; if not, write to the Free Software

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"""Parser of import data into command objects.

In order to reuse existing front-ends, the stream format is a subset of

the one used by git-fast-import (as of the 1.5.4 release of git at least).

The grammar is:

stream ::= cmd*;

cmd ::= new_blob

| new_commit

| new_tag

| reset_branch

| checkpoint

| progress

;

new_blob ::= 'blob' lf

mark?

file_content;

file_content ::= data;

new_commit ::= 'commit' sp ref_str lf

mark?

('author' sp name '<' email '>' when lf)?

'committer' sp name '<' email '>' when lf

commit_msg

('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?

('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*

file_change*

lf?;

commit_msg ::= data;

file_change ::= file_clr

| file_del

| file_rnm

| file_cpy

| file_obm

| file_inm;

file_clr ::= 'deleteall' lf;

file_del ::= 'D' sp path_str lf;

file_rnm ::= 'R' sp path_str sp path_str lf;

file_cpy ::= 'C' sp path_str sp path_str lf;

file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;

file_inm ::= 'M' sp mode sp 'inline' sp path_str lf

data;

new_tag ::= 'tag' sp tag_str lf

'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf

'tagger' sp name '<' email '>' when lf

tag_msg;

tag_msg ::= data;

reset_branch ::= 'reset' sp ref_str lf

('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?

lf?;

checkpoint ::= 'checkpoint' lf

lf?;

progress ::= 'progress' sp not_lf* lf

lf?;

# note: the first idnum in a stream should be 1 and subsequent

# idnums should not have gaps between values as this will cause

# the stream parser to reserve space for the gapped values. An

# idnum can be updated in the future to a new object by issuing

# a new mark directive with the old idnum.

mark ::= 'mark' sp idnum lf;

data ::= (delimited_data | exact_data)

lf?;

# note: delim may be any string but must not contain lf.

# data_line may contain any data but must not be exactly

# delim. The lf after the final data_line is included in

# the data.

delimited_data ::= 'data' sp '<<' delim lf

(data_line lf)*

delim lf;

# note: declen indicates the length of binary_data in bytes.

# declen does not include the lf preceding the binary data.

100

exact_data ::= 'data' sp declen lf

101

binary_data;

102

103

# note: quoted strings are C-style quoting supporting \c for
# common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
# is the signed byte value in octal. Note that the only
# characters which must actually be escaped to protect the
# stream formatting are: \, " and LF. Otherwise these values
# are UTF8.

109

110

ref_str ::= ref;

111

sha1exp_str ::= sha1exp;

112

tag_str ::= tag;

113

path_str ::= path | '"' quoted(path) '"' ;

114

mode ::= '100644' | '644'

115

| '100755' | '755'

116

| '120000'

117

;

118

119

declen ::= # unsigned 32 bit value, ascii base10 notation;

120

bigint ::= # unsigned integer value, ascii base10 notation;

121

binary_data ::= # file content, not interpreted;

122

123

when ::= raw_when | rfc2822_when;

124

raw_when ::= ts sp tz;

125

rfc2822_when ::= # Valid RFC 2822 date and time;

126

127

sp ::= # ASCII space character;

128

lf ::= # ASCII newline (LF) character;

129

130

# note: a colon (':') must precede the numerical value assigned to

131

# an idnum. This is to distinguish it from a ref or tag name as

132

# GIT does not permit ':' in ref or tag strings.

133

134

idnum ::= ':' bigint;

135

path ::= # GIT style file path, e.g. "a/b/c";

136

ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";

137

tag ::= # GIT tag name, e.g. "FIREFOX_1_5";

138

sha1exp ::= # Any valid GIT SHA1 expression;

139

hexsha1 ::= # SHA1 in hexadecimal format;

140

141

# note: name and email are UTF8 strings, however name must not

142

# contain '<' or lf and email must not contain any of the

143

# following: '<', '>', lf.

144

145

name ::= # valid GIT author/committer name;

146

email ::= # valid GIT author/committer email;

147

ts ::= # time since the epoch in seconds, ascii base10 notation;

148

tz ::= # GIT style timezone;

149

150

# note: comments may appear anywhere in the input, except

151

# within a data command. Any form of the data command

152

# always escapes the related input from comment processing.

153

154

# In case it is not clear, the '#' that starts the comment
# must be the first character on the line (an lf must have
# preceded it).

157

158

comment ::= '#' not_lf* lf;

159

not_lf ::= # Any byte that is not ASCII newline (LF);

160

"""

161

162

163

import re

164

import sys

165

166

import commands

167

import dates

168

import errors

169

170

171

## Stream parsing ##

172

173

class LineBasedParser(object):
    """A line-oriented reader that keeps track of the current line number.

    Lines may be pushed back to be read again. The raw reads
    (read_bytes and read_until) go straight to the underlying input and
    bypass the push-back buffer.
    """

    def __init__(self, input):
        """A Parser that keeps track of line numbers.

        :param input: the file-like object to read from
        """
        self.input = input
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise; it is called with
          the current line number followed by args
        """
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF."""
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if not line:
            return None
        # Only strip a real newline: the final line of a stream may not
        # have one, and blindly dropping the last character would corrupt it.
        if line.endswith("\n"):
            return line[:-1]
        return line

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + "\n")

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        self.lineno += result.count("\n")
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :param terminator: the delimiter, without its trailing newline
        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + '\n'
        while True:
            line = self.input.readline()
            if not line:
                # EOF: readline() would keep returning '' forever, so stop
                # here instead of looping endlessly.
                self.abort(errors.MissingTerminator, terminator)
            # Keep line numbers accurate for later error messages, in the
            # same way read_bytes() does.
            self.lineno += 1
            if line == term:
                break
            lines.append(line)
        return ''.join(lines)

248

249

250

# Regular expressions used for parsing user sections. (Note: The spec states
# that the name part should be non-empty but git-fast-export doesn't always
# do that, so the name group is [^<]*, not [^<]+.) The git-fast-import code
# also says the space before the email is optional, so any trailing space
# ends up inside the name group and has to be removed by the caller.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
_WHO_RE = re.compile(r'([^<]*)<(.*)>')

256

257

258

class ImportParser(LineBasedParser):
    """Parser turning a fast-import stream into command objects."""

    def __init__(self, input, verbose=False, output=sys.stdout,
                 user_mapper=None):
        """A Parser of import commands.

        :param input: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        :param user_mapper: if not None, the UserMapper used to adjust
          user-ids for authors, committers and taggers.
        """
        LineBasedParser.__init__(self, input)
        self.verbose = verbose
        self.output = output
        self.user_mapper = user_mapper
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None

    def warning(self, msg):
        """Write a warning prefixed with the current line number to stderr."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def iter_commands(self):
        """Iterator returning ImportCommand objects.

        Blank lines and comment lines (starting with '#') are skipped.
        Aborts with InvalidCommand on an unrecognised line.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for commands in order of likelihood
            elif line.startswith('commit '):
                yield self._parse_commit(line[len('commit '):])
            elif line.startswith('blob'):
                yield self._parse_blob()
            elif line.startswith('progress '):
                yield commands.ProgressCommand(line[len('progress '):])
            elif line.startswith('reset '):
                yield self._parse_reset(line[len('reset '):])
            elif line.startswith('tag '):
                yield self._parse_tag(line[len('tag '):])
            elif line.startswith('checkpoint'):
                yield commands.CheckpointCommand()
            # The prefix tested must be the prefix sliced off, otherwise
            # the first character of the feature name is silently dropped.
            elif line.startswith('feature '):
                yield self._parse_feature(line[len('feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith('M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith('D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith('R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith('C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith('deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _decode_utf8(self, data, what):
        """Decode data as UTF-8, warning and replacing on bad bytes.

        :param data: the byte string to decode
        :param what: description of the data, used in the warning text
        :return: the decoded string
        """
        try:
            return data.decode('utf_8')
        except UnicodeDecodeError:
            self.warning("%s not in utf8 - replacing unknown characters"
                % (what,))
            return data.decode('utf_8', 'replace')

    def _parse_blob(self):
        """Parse a blob command."""
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data('blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the text following 'commit ' on the command line
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info('commit', 'author', False)
        more_authors = []
        while True:
            another_author = self._get_user_info('commit', 'author', False)
            if another_author is None:
                break
            more_authors.append(another_author)
        committer = self._get_user_info('commit', 'committer')
        message = self._decode_utf8(self._get_data('commit', 'message'),
            'commit message')
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is None:
                break
            # while the spec suggests it's illegal, git-fast-export
            # outputs multiple merges on the one line, e.g.
            # merge :x :y :z
            merges.extend(merge.split(" "))
        properties = {}
        while True:
            name_value = self._get_property()
            if name_value is None:
                break
            name, value = name_value
            properties[name] = value
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, self.iter_file_commands, lineno=lineno,
            more_authors=more_authors, properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command.

        :param info: a string in the format "name" or "name=value"
        """
        parts = info.split("=", 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        return commands.FeatureCommand(name, value, lineno=self.lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(' ', 2)
        if len(params) != 3:
            # Report a parse error rather than failing with an IndexError
            self.abort(errors.BadFormat, 'filemodify', 'M', info)
        path = self._path(params[2])
        is_executable, kind = self._mode(params[0])
        if params[1] == 'inline':
            dataref = None
            data = self._get_data('filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, kind, is_executable, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command."""
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command."""
        from_ = self._get_from('tag')
        tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
        # Tolerate non-utf8 tag messages the same way commit messages are
        message = self._decode_utf8(self._get_data('tag', 'message'),
            'tag message')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the text after 'mark :' or None if the next line is not
          a mark section (the line is pushed back) or EOF was reached.
        """
        line = self.next_line()
        if line is None:
            return None
        if line.startswith('mark :'):
            return line[len('mark :'):]
        self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: if set, the name of the command requiring a
          from section; MissingSection is raised when a different line
          is found instead.
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('from '):
            return line[len('from '):]
        elif required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        else:
            self.push_line(line)
            return None

    def _get_merge(self):
        """Parse a merge section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('merge '):
            return line[len('merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('property '):
            return self._name_value(line[len('property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
                       accept_just_who=False):
        """Parse a user section.

        :param cmd: the command being parsed, used in error reports
        :param section: the section keyword, e.g. 'author'
        :param required: if True, abort with MissingSection when the
          section is absent; otherwise push the line back and return None
        :param accept_just_who: passed through to _who_when
        """
        line = self.next_line()
        if line is not None and line.startswith(section + ' '):
            return self._who_when(line[len(section + ' '):], cmd, section,
                accept_just_who=accept_just_who)
        if required:
            self.abort(errors.MissingSection, cmd, section)
        # At EOF there is no line to hand back
        if line is not None:
            self.push_line(line)
        return None

    def _get_data(self, required_for, section='data'):
        """Parse a data section.

        :param required_for: the command being parsed, used in error reports
        :param section: the section name used in error reports
        :return: the data read, in either the delimited or the
          exact-length form
        """
        line = self.next_line()
        if line is None or not line.startswith('data '):
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len('data '):]
        if rest.startswith('<<'):
            return self.read_until(rest[2:])
        size = int(rest)
        read_bytes = self.read_bytes(size)
        # optional LF after data.
        trailer = self.input.readline()
        if trailer:
            self.lineno += 1
            if trailer != "\n":
                # Not the optional LF: hand the line back, taking care
                # not to truncate a final line that has no newline.
                self.push_line(trailer.rstrip("\n"))
        return read_bytes

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :param s: the text following the section keyword
        :param cmd: the command being parsed, used in error reports
        :param section: the section being parsed, used in error reports
          and warnings
        :param accept_just_who: if True, a missing date is tolerated and
          the 'now' date parser is used instead
        :return: a tuple of (name,email,timestamp,timezone). name may be
          the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3).lstrip()
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(' ')) == 2:
                    date_format = 'raw'
                elif datestr == 'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            try:
                when = self.date_parser(datestr, self.lineno)
            except ValueError:
                self.warning("failed to parse datestr '%s'" % (datestr,))
                raise
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
            else:
                self.abort(errors.BadFormat, cmd, section, s)
        name = match.group(1)
        # The space separating the name from '<' is optional; drop it if
        # present, then always decode so name and email have the same type.
        if name.endswith(" "):
            name = name[:-1]
        # The spec says names are *typically* utf8 encoded
        # but that isn't enforced by git-fast-export (at least)
        name = self._decode_utf8(name, "%s name" % (section,))
        # While it shouldn't happen, some datasets have email addresses
        # which contain unicode characters. See bug 338186. We sanitize
        # the data at this level just in case.
        email = self._decode_utf8(match.group(2), "%s email" % (section,))
        if self.user_mapper:
            name, email = self.user_mapper.map_name_and_email(name, email)
        return (name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'."""
        parts = s.split(' ', 2)
        name = parts[0]
        if len(parts) == 1:
            value = None
        elif len(parts) == 2:
            # A length with no value field is malformed
            self.abort(errors.BadFormat, 'commit', 'property', s)
        else:
            size = int(parts[1])
            value = parts[2]
            still_to_read = size - len(value)
            if still_to_read > 0:
                # The value continues past the end of the line; the final
                # byte read is the newline terminating the property.
                read_bytes = self.read_bytes(still_to_read)
                value += "\n" + read_bytes[:still_to_read - 1]
            value = value.decode('utf8')
        return (name, value)

    def _path(self, s):
        """Parse a path.

        A path wrapped in double quotes is C-style unquoted; any other
        path is decoded as utf8 where possible.
        """
        if s.startswith('"'):
            # A lone '"' both starts and ends with a quote but is not a
            # quoted string.
            if len(s) < 2 or not s.endswith('"'):
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])
        try:
            return s.decode('utf_8')
        except UnicodeDecodeError:
            # The spec recommends utf8 encoding but that isn't enforced
            return s

    def _path_pair(self, s):
        """Parse two paths separated by a space."""
        # TODO: handle a space in the first path
        if s.startswith('"'):
            parts = s[1:].split('" ', 1)
        else:
            parts = s.split(' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith('"') and parts[1].endswith('"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith('"') or parts[1].endswith('"'):
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(part) for part in parts]

    def _mode(self, s):
        """Parse a file mode into executable and kind.

        :return (is_executable, kind)
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in ('644', '100644', '0100644'):
            return False, commands.FILE_KIND
        elif s in ('755', '100755', '0100755'):
            return True, commands.FILE_KIND
        elif s in ('040000', '0040000'):
            return False, commands.DIRECTORY_KIND
        elif s in ('120000', '0120000'):
            return False, commands.SYMLINK_KIND
        elif s in ('160000', '0160000'):
            return False, commands.TREE_REFERENCE_KIND
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)

621

622

623

def _unquote_c_string(s):

624

"""replace C-style escape sequences (\n, \", etc.) with real chars."""

625

# HACK: Python strings are close enough

626

return s.decode('string_escape', 'replace')

Older »