~zyga/bzr-fastimport/fixes : revision 1

1

2

#

3

# This program is free software; you can redistribute it and/or modify

4

# it under the terms of the GNU General Public License as published by

5

# the Free Software Foundation; either version 2 of the License, or

6

# (at your option) any later version.

7

#

8

# This program is distributed in the hope that it will be useful,

9

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# GNU General Public License for more details.

12

#

13

# You should have received a copy of the GNU General Public License

14

# along with this program; if not, write to the Free Software

15

# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

16

17

"""Parser of import data into command objects.

18

19

In order to reuse existing front-ends, the stream format is a subset of

20

the one used by git-fast-import (as of the 1.5.4 release of git at least).

21

The grammar is:

22

23

stream ::= cmd*;

24

25

cmd ::= new_blob

26

| new_commit

27

| new_tag

28

| reset_branch

29

| checkpoint

30

| progress

31

;

32

33

new_blob ::= 'blob' lf

34

mark?

35

file_content;

36

file_content ::= data;

37

38

new_commit ::= 'commit' sp ref_str lf

39

mark?

40

('author' sp name '<' email '>' when lf)?

41

'committer' sp name '<' email '>' when lf

42

commit_msg

43

('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?

44

('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*

45

file_change*

46

lf?;

47

commit_msg ::= data;

48

49

file_change ::= file_clr

50

| file_del

51

| file_rnm

52

| file_cpy

53

| file_obm

54

| file_inm;

55

file_clr ::= 'deleteall' lf;

56

file_del ::= 'D' sp path_str lf;

57

file_rnm ::= 'R' sp path_str sp path_str lf;

58

file_cpy ::= 'C' sp path_str sp path_str lf;

59

file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;

60

file_inm ::= 'M' sp mode sp 'inline' sp path_str lf

61

data;

62

63

new_tag ::= 'tag' sp tag_str lf

64

'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf

65

'tagger' sp name '<' email '>' when lf

66

tag_msg;

67

tag_msg ::= data;

68

69

reset_branch ::= 'reset' sp ref_str lf

70

('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?

71

lf?;

72

73

checkpoint ::= 'checkpoint' lf

74

lf?;

75

76

progress ::= 'progress' sp not_lf* lf

77

lf?;

78

79

# note: the first idnum in a stream should be 1 and subsequent

80

# idnums should not have gaps between values as this will cause

81

# the stream parser to reserve space for the gapped values. An

82

# idnum can be updated in the future to a new object by issuing

83

# a new mark directive with the old idnum.

84

#

85

mark ::= 'mark' sp idnum lf;

86

data ::= (delimited_data | exact_data)

87

lf?;

88

89

# note: delim may be any string but must not contain lf.

90

# data_line may contain any data but must not be exactly

91

# delim.

92

delimited_data ::= 'data' sp '<<' delim lf

93

(data_line lf)*

94

delim lf;

95

96

# note: declen indicates the length of binary_data in bytes.

97

# declen does not include the lf preceding the binary data.

98

#

99

exact_data ::= 'data' sp declen lf

100

binary_data;

101

102

# note: quoted strings are C-style quoting supporting \c for

103

# common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn

104

# is the signed byte value in octal. Note that the only

105

# characters which must actually be escaped to protect the

106

# stream formatting is: \, " and LF. Otherwise these values

107

# are UTF8.

108

#

109

ref_str ::= ref;

110

sha1exp_str ::= sha1exp;

111

tag_str ::= tag;

112

path_str ::= path | '"' quoted(path) '"' ;

113

mode ::= '100644' | '644'

114

| '100755' | '755'

115

| '120000'

116

;

117

118

declen ::= # unsigned 32 bit value, ascii base10 notation;

119

bigint ::= # unsigned integer value, ascii base10 notation;

120

binary_data ::= # file content, not interpreted;

121

122

when ::= raw_when | rfc2822_when;

123

raw_when ::= ts sp tz;

124

rfc2822_when ::= # Valid RFC 2822 date and time;

125

126

sp ::= # ASCII space character;

127

lf ::= # ASCII newline (LF) character;

128

129

# note: a colon (':') must precede the numerical value assigned to

130

# an idnum. This is to distinguish it from a ref or tag name as

131

# GIT does not permit ':' in ref or tag strings.

132

#

133

idnum ::= ':' bigint;

134

path ::= # GIT style file path, e.g. "a/b/c";

135

ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";

136

tag ::= # GIT tag name, e.g. "FIREFOX_1_5";

137

sha1exp ::= # Any valid GIT SHA1 expression;

138

hexsha1 ::= # SHA1 in hexadecimal format;

139

140

# note: name and email are UTF8 strings, however name must not

141

# contain '<' or lf and email must not contain any of the

142

# following: '<', '>', lf.

143

#

144

name ::= # valid GIT author/committer name;

145

email ::= # valid GIT author/committer email;

146

ts ::= # time since the epoch in seconds, ascii base10 notation;

147

tz ::= # GIT style timezone;

148

149

# note: comments may appear anywhere in the input, except

150

# within a data command. Any form of the data command

151

# always escapes the related input from comment processing.

152

#

153

# In case it is not clear, the '#' that starts the comment

154

# must be the first character on that line (an lf must have

155

# preceded it).

156

#

157

comment ::= '#' not_lf* lf;

158

not_lf ::= # Any byte that is not ASCII newline (LF);

159

"""

160

161

162

import re

163

import sys

164

165

import commands

166

import dates

167

import errors

168

169

170

## Stream parsing ##

171

172

class LineBasedParser(object):
    """A line-oriented reader that keeps track of the current line number.

    Lines are normally taken from the wrapped file-like object, but a line
    can be pushed back so that the next read returns it again. The current
    line number is passed to every exception raised through abort().
    """

    def __init__(self, input):
        """A Parser that keeps track of line numbers.

        :param input: the file-like object to read from
        """
        self.input = input
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise; it is called with
          the current line number followed by args
        """
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF.

        Pushed-back lines are returned before anything new is read from
        the input. The line counter is advanced on every call.
        """
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if not line:
            return None
        # Only strip a newline that is really there: the last line of a
        # stream may be unterminated, and blindly dropping the final
        # character would lose data.
        if line.endswith("\n"):
            return line[:-1]
        return line

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + "\n")

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :param count: the number of bytes wanted
        :return: exactly count bytes of data
        """
        chunks = []
        remaining = count
        while remaining > 0:
            # readline() never returns more than asked for, so repeated
            # calls collect the data without reading past the end of it.
            chunk = self.input.readline(remaining)
            if not chunk:
                # EOF before enough data was found
                break
            remaining -= len(chunk)
            chunks.append(chunk)
        result = ''.join(chunks)
        # Keep the line counter in step with the input so that errors
        # reported after a data section point at the right line.
        self.lineno += result.count('\n')
        found = len(result)
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :return: the bytes read up to but excluding the terminator.
        """
        # Not implemented yet: delimited data sections are unsupported.
        raise NotImplementedError(self.read_until)

244

245

246

# Regular expressions used for parsing

# Matches "name <email> when". The name may be any text without '<' (so
# multi-word and punctuated names are captured whole rather than just
# their last word) and the email may not contain '<' or '>'.
_WHO_AND_WHEN_RE = re.compile(r'([^<]+) <([^<>]+)> (.+)')

248

249

250

class ImportParser(LineBasedParser):
    """Parser turning a fast-import stream into command objects.

    iter_commands() yields one object from the commands module per
    top-level command in the stream; the file changes belonging to a
    commit are produced by iter_file_commands().
    """

    def __init__(self, input, verbose=False, output=sys.stdout):
        """A Parser of import commands.

        :param input: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        """
        LineBasedParser.__init__(self, input)
        self.verbose = verbose
        self.output = output
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None

    def iter_commands(self):
        """Iterator returning ImportCommand objects.

        Blank lines and comment lines between commands are skipped. An
        unrecognised line aborts with InvalidCommand.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for commands in order of likelihood
            elif line.startswith('commit '):
                yield self._parse_commit(line[len('commit '):])
            elif line.startswith('blob'):
                yield self._parse_blob()
            elif line.startswith('progress '):
                yield commands.ProgressCommand(line[len('progress '):])
            elif line.startswith('reset '):
                yield self._parse_reset(line[len('reset '):])
            elif line.startswith('tag '):
                yield self._parse_tag(line[len('tag '):])
            elif line.startswith('checkpoint'):
                yield commands.CheckpointCommand()
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith('M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith('D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith('R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith('C '):
                src, dest = self._path_pair(line[2:])
                # NOTE(review): a copy ('C') is reported as a rename here.
                # Presumably this should be a dedicated copy command --
                # confirm what the commands module offers before changing.
                yield commands.FileRenameCommand(src, dest)
            elif line.startswith('deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                # Not a file command: leave it for iter_commands()
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        mark = self._get_mark_if_any()
        data = self._get_data('blob')
        return commands.BlobCommand(mark, data)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the text following 'commit ' on the command line
        """
        mark = self._get_mark_if_any()
        author = self._get_user_info('commit', 'author', False)
        committer = self._get_user_info('commit', 'committer')
        message = self._get_data('commit', 'message')
        from_ = self._get_from()
        if from_ is not None:
            parents = [from_]
            # Merge sections are only looked for after a from section
            while True:
                merge = self._get_merge()
                if merge is not None:
                    parents.append(merge)
                else:
                    break
        else:
            parents = []
        # The file commands are not parsed here: the bound method is
        # handed over so they can be read from the stream on demand.
        return commands.CommitCommand(ref, mark, author, committer, message,
            parents, self.iter_file_commands)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(' ', 2)
        if len(params) != 3:
            # The path is the last field, so it is what a short line lacks
            self.abort(errors.BadFormat, 'filemodify', 'path', info)
        path = self._path(params[2])
        is_executable, is_symlink = self._mode(params[0])
        if is_symlink:
            kind = commands.SYMLINK_KIND
        else:
            kind = commands.FILE_KIND
        if params[1] == 'inline':
            # The content follows as a data section
            dataref = None
            data = self._get_data('filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, kind, is_executable, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command.

        :param ref: the text following 'reset ' on the command line
        """
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command.

        :param name: the text following 'tag ' on the command line
        """
        from_ = self._get_from('tag')
        tagger = self._get_user_info('tag', 'tagger')
        message = self._get_data('tag', 'message')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the mark id without the leading ':', or None if the next
          line is not a mark section (the line is pushed back) or the
          input is exhausted
        """
        line = self.next_line()
        if line is None:
            # EOF: there is no line to inspect or push back
            return None
        if line.startswith('mark :'):
            return line[len('mark :'):]
        self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: name of the command that needs this section;
          if given and the section is absent, MissingSection is raised
        :return: the text following 'from ', or None if the section is
          optional and absent
        """
        line = self.next_line()
        if line is not None and line.startswith('from '):
            return line[len('from '):]
        if required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        if line is not None:
            # At EOF there is nothing to push back
            self.push_line(line)
        return None

    def _get_merge(self):
        """Parse a merge section.

        :return: the text following 'merge ', or None if the next line is
          not a merge section (the line is pushed back) or the input is
          exhausted
        """
        line = self.next_line()
        if line is None:
            return None
        if line.startswith('merge '):
            return line[len('merge '):]
        self.push_line(line)
        return None

    def _get_user_info(self, cmd, section, required=True):
        """Parse a user section.

        :param cmd: name of the command being parsed (used in errors)
        :param section: the section keyword, e.g. 'author'
        :param required: if True, MissingSection is raised when the
          section is absent
        :return: the result of _who_when(), or None if the section is
          optional and absent
        """
        prefix = section + ' '
        line = self.next_line()
        if line is not None and line.startswith(prefix):
            return self._who_when(line[len(prefix):], cmd, section)
        if required:
            self.abort(errors.MissingSection, cmd, section)
        if line is not None:
            # At EOF there is nothing to push back
            self.push_line(line)
        return None

    def _get_data(self, required_for, section='data'):
        """Parse a data section.

        :param required_for: name of the command being parsed
        :param section: the name to report if the section is missing
        :return: the content of the data section
        """
        line = self.next_line()
        if line is None or not line.startswith('data '):
            # Also covers the input ending where data was expected
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len('data '):]
        if rest.startswith('<<'):
            # Delimited form: everything up to the given delimiter
            return self.read_until(rest[2:])
        # Exact form: a byte count followed by that many bytes
        size = int(rest)
        return self.read_bytes(size)

    def _who_when(self, s, cmd, section):
        """Parse who and when information from a string.

        The date format is detected from the first date seen and the
        chosen parser is then reused for the rest of the stream.

        :param s: the text following the section keyword
        :param cmd: name of the command being parsed (used in errors)
        :param section: name of the section being parsed (used in errors)
        :return: a tuple of (who,email,when) where who and
          email are strings and when is whatever the selected parser
          from dates.DATE_PARSERS_BY_NAME returns
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3)
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(' ')) == 2:
                    # "<timestamp> <timezone>": two fields, whatever
                    # the number of digits in the timestamp
                    date_format = 'raw'
                elif datestr == 'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            when = self.date_parser(datestr)
            return (match.group(1), match.group(2), when)
        else:
            self.abort(errors.BadFormat, cmd, section, s)

    def _path(self, s):
        """Parse a path."""
        # TODO: handle quoted paths
        return s

    def _path_pair(self, s):
        """Parse two paths separated by a space."""
        # TODO: handle quoted paths
        return tuple(s.split(' ', 1))

    def _mode(self, s):
        """Parse a file mode into executable and symlink flags.

        :return (is_executable, is_symlink)
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in ('644', '100644', '0100644'):
            return False, False
        elif s in ('755', '100755', '0100755'):
            return True, False
        elif s == '120000':
            return False, True
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)

479