1
# Copyright (C) 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Parser of import data into command objects.
19
In order to reuse existing front-ends, the stream format is a subset of
20
the one used by git-fast-import (as of the 1.5.4 release of git at least).
33
new_blob ::= 'blob' lf
36
file_content ::= data;
38
new_commit ::= 'commit' sp ref_str lf
40
('author' sp name '<' email '>' when lf)?
41
'committer' sp name '<' email '>' when lf
43
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
44
('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
49
file_change ::= file_clr
55
file_clr ::= 'deleteall' lf;
56
file_del ::= 'D' sp path_str lf;
57
file_rnm ::= 'R' sp path_str sp path_str lf;
58
file_cpy ::= 'C' sp path_str sp path_str lf;
59
file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
60
file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
63
new_tag ::= 'tag' sp tag_str lf
64
'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
65
'tagger' sp name '<' email '>' when lf
69
reset_branch ::= 'reset' sp ref_str lf
70
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
73
checkpoint ::= 'checkpoint' lf
76
progress ::= 'progress' sp not_lf* lf
79
# note: the first idnum in a stream should be 1 and subsequent
80
# idnums should not have gaps between values as this will cause
81
# the stream parser to reserve space for the gapped values. An
82
# idnum can be updated in the future to a new object by issuing
83
# a new mark directive with the old idnum.
85
mark ::= 'mark' sp idnum lf;
86
data ::= (delimited_data | exact_data)
89
# note: delim may be any string but must not contain lf.
90
# data_line may contain any data but must not be exactly
92
delimited_data ::= 'data' sp '<<' delim lf
96
# note: declen indicates the length of binary_data in bytes.
97
# declen does not include the lf preceding the binary data.
99
exact_data ::= 'data' sp declen lf
102
# note: quoted strings are C-style quoting supporting \c for
103
# common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
104
# is the signed byte value in octal. Note that the only
105
# characters which must actually be escaped to protect the
106
# stream formatting is: \, " and LF. Otherwise these values
110
sha1exp_str ::= sha1exp;
112
path_str ::= path | '"' quoted(path) '"' ;
113
mode ::= '100644' | '644'
118
declen ::= # unsigned 32 bit value, ascii base10 notation;
119
bigint ::= # unsigned integer value, ascii base10 notation;
120
binary_data ::= # file content, not interpreted;
122
when ::= raw_when | rfc2822_when;
123
raw_when ::= ts sp tz;
124
rfc2822_when ::= # Valid RFC 2822 date and time;
126
sp ::= # ASCII space character;
127
lf ::= # ASCII newline (LF) character;
129
# note: a colon (':') must precede the numerical value assigned to
130
# an idnum. This is to distinguish it from a ref or tag name as
131
# GIT does not permit ':' in ref or tag strings.
133
idnum ::= ':' bigint;
134
path ::= # GIT style file path, e.g. "a/b/c";
135
ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
136
tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
137
sha1exp ::= # Any valid GIT SHA1 expression;
138
hexsha1 ::= # SHA1 in hexadecimal format;
140
# note: name and email are UTF8 strings, however name must not
141
# contain '<' or lf and email must not contain any of the
142
# following: '<', '>', lf.
144
name ::= # valid GIT author/committer name;
145
email ::= # valid GIT author/committer email;
146
ts ::= # time since the epoch in seconds, ascii base10 notation;
147
tz ::= # GIT style timezone;
149
# note: comments may appear anywhere in the input, except
150
# within a data command. Any form of the data command
151
# always escapes the related input from comment processing.
153
# In case it is not clear, the '#' that starts the comment
154
# must be the first character on that line (an lf must have
157
comment ::= '#' not_lf* lf;
158
not_lf ::= # Any byte that is not ASCII newline (LF);
172
class LineBasedParser(object):
174
def __init__(self, input):
175
"""A Parser that keeps track of line numbers.
177
:param input: the file-like object to read from
181
# Lines pushed back onto the input stream
184
def abort(self, exception, *args):
    """Raise an exception that carries the current line number.

    :param exception: the exception class (or other callable) to
      instantiate; it is called with ``self.lineno`` followed by
      ``args``
    :param args: extra positional arguments forwarded after the
      line number
    """
    failure = exception(self.lineno, *args)
    raise failure
189
"""Get the next line including the newline or '' on EOF."""
192
return self._buffer.pop()
194
return self.input.readline()
197
"""Get the next line without the newline or None on EOF."""
198
line = self.readline()
204
def push_line(self, line):
205
"""Push line back onto the line buffer.
207
:param line: the line with no trailing newline
210
self._buffer.append(line + "\n")
212
def read_bytes(self, count):
213
"""Read a given number of bytes from the input stream.
215
Throws MissingBytes if the bytes are not found.
217
Note: This method does not read from the line buffer.
222
line = self.input.readline(left)
228
result = ''.join(lines)
231
self.abort(errors.MissingBytes, count, found)
234
def read_until(self, terminator):
    """Read the input stream until the terminator is found.

    Lines are read directly from ``self.input`` until a line made up
    of exactly the terminator followed by a newline is seen. That
    terminator line is consumed but not returned.

    Throws MissingTerminator if the terminator is not found.

    Note: This method does not read from the line buffer.

    :param terminator: the delimiter to look for, without its
      trailing newline
    :return: the bytes read up to but excluding the terminator.
    """
    end_marker = terminator + '\n'
    lines = []
    while True:
        line = self.input.readline()
        if line == end_marker:
            break
        if not line:
            # readline() returns an empty string at EOF, so the
            # terminator can no longer turn up.
            # NOTE(review): errors.MissingTerminator is the exception
            # named in this docstring; confirm it exists in errors.
            self.abort(errors.MissingTerminator, terminator)
        lines.append(line)
    return ''.join(lines)
246
# Regular expressions used for parsing.
# Groups: (1) name, (2) email, (3) date string. The name group accepts
# any run of characters other than '<', so multi-word names and names
# with non-word characters (e.g. "Joe Bloggs", "J. O'Neil") are captured
# whole rather than being cut down to their last word.
_WHO_AND_WHEN_RE = re.compile(r'([^<]+) <(.+)> (.+)')
250
class ImportParser(LineBasedParser):
252
def __init__(self, input, verbose=False, output=sys.stdout):
253
"""A Parser of import commands.
255
:param input: the file-like object to read from
256
:param verbose: display extra information or not
257
:param output: the file-like object to write messages to (YAGNI?)
259
LineBasedParser.__init__(self, input)
260
self.verbose = verbose
262
# We auto-detect the date format when a date is first encountered
263
self.date_parser = None
265
def iter_commands(self):
266
"""Iterator returning ImportCommand objects."""
268
line = self.next_line()
271
elif len(line) == 0 or line.startswith('#'):
273
# Search for commands in order of likelihood
274
elif line.startswith('commit '):
275
yield self._parse_commit(line[len('commit '):])
276
elif line.startswith('blob'):
277
yield self._parse_blob()
278
elif line.startswith('progress '):
279
yield commands.ProgressCommand(line[len('progress '):])
280
elif line.startswith('reset '):
281
yield self._parse_reset(line[len('reset '):])
282
elif line.startswith('tag '):
283
yield self._parse_tag(line[len('tag '):])
284
elif line.startswith('checkpoint'):
285
yield commands.CheckpointCommand()
287
self.abort(errors.InvalidCommand, line)
289
def iter_file_commands(self):
    """Iterator returning FileCommand objects.

    Blank lines and comment lines (those starting with '#') are
    skipped. Iteration ends at EOF.

    If an invalid file command is found, the line is silently
    pushed back and iteration ends.
    """
    while True:
        line = self.next_line()
        if line is None:
            # EOF
            break
        elif len(line) == 0 or line.startswith('#'):
            continue
        # Search for file commands in order of likelihood
        elif line.startswith('M '):
            yield self._parse_file_modify(line[2:])
        elif line.startswith('D '):
            path = self._path(line[2:])
            yield commands.FileDeleteCommand(path)
        elif line.startswith('R '):
            old, new = self._path_pair(line[2:])
            yield commands.FileRenameCommand(old, new)
        elif line.startswith('C '):
            # 'C' is the filecopy command, so it must not be reported
            # as a rename.
            # NOTE(review): assumes commands.FileCopyCommand exists
            # next to FileRenameCommand - confirm.
            src, dest = self._path_pair(line[2:])
            yield commands.FileCopyCommand(src, dest)
        elif line.startswith('deleteall'):
            yield commands.FileDeleteAllCommand()
        else:
            # Not a file command: hand the line back so the caller
            # can parse it as the next top-level command.
            self.push_line(line)
            break
319
def _parse_blob(self):
    """Parse the sections that follow a 'blob' command line.

    Reads the mark section (via _get_mark_if_any) and then the data
    section, in that order.

    :return: a commands.BlobCommand built from the mark and the data
    """
    blob_mark = self._get_mark_if_any()
    blob_data = self._get_data('blob')
    return commands.BlobCommand(blob_mark, blob_data)
325
def _parse_commit(self, ref):
326
"""Parse a commit command."""
327
mark = self._get_mark_if_any()
328
author = self._get_user_info('commit', 'author', False)
329
committer = self._get_user_info('commit', 'committer')
330
message = self._get_data('commit', 'message')
331
from_ = self._get_from()
332
if from_ is not None:
335
merge = self._get_merge()
336
if merge is not None:
337
parents.append(merge)
342
return commands.CommitCommand(ref, mark, author, committer, message,
343
parents, self.iter_file_commands)
345
def _parse_file_modify(self, info):
346
"""Parse a filemodify command within a commit.
348
:param info: a string in the format "mode dataref path"
349
(where dataref might be the hard-coded literal 'inline').
351
params = info.split(' ', 2)
352
path = self._path(params[2])
353
is_executable, is_symlink = self._mode(params[0])
355
kind = commands.SYMLINK_KIND
357
kind = commands.FILE_KIND
358
if params[1] == 'inline':
360
data = self._get_data('filemodify')
364
return commands.FileModifyCommand(path, kind, is_executable, dataref,
367
def _parse_reset(self, ref):
    """Parse the sections that follow a 'reset' command line.

    :param ref: the text following 'reset ' on the command line
    :return: a commands.ResetCommand for ref and whatever
      _get_from() returned
    """
    source = self._get_from()
    return commands.ResetCommand(ref, source)
372
def _parse_tag(self, name):
    """Parse the sections that follow a 'tag' command line.

    Reads, in stream order, the 'from' section (requested with
    required_for='tag'), the 'tagger' user section and the message
    data.

    :param name: the text following 'tag ' on the command line
    :return: a commands.TagCommand built from those parts
    """
    target = self._get_from('tag')
    who = self._get_user_info('tag', 'tagger')
    text = self._get_data('tag', 'message')
    return commands.TagCommand(name, target, who, text)
379
def _get_mark_if_any(self):
380
"""Parse a mark section."""
381
line = self.next_line()
382
if line.startswith('mark :'):
383
return line[len('mark :'):]
388
def _get_from(self, required_for=None):
389
"""Parse a from section."""
390
line = self.next_line()
391
if line.startswith('from '):
392
return line[len('from '):]
394
self.abort(errors.MissingSection, required_for, 'from')
399
def _get_merge(self):
400
"""Parse a merge section."""
401
line = self.next_line()
402
if line.startswith('merge '):
403
return line[len('merge '):]
408
def _get_user_info(self, cmd, section, required=True):
409
"""Parse a user section."""
410
line = self.next_line()
411
if line.startswith(section + ' '):
412
return self._who_when(line[len(section + ' '):], cmd, section)
414
self.abort(errors.MissingSection, cmd, section)
419
def _get_data(self, required_for, section='data'):
420
"""Parse a data section."""
421
line = self.next_line()
422
if line.startswith('data '):
423
rest = line[len('data '):]
424
if rest.startswith('<<'):
425
return self.read_until(rest[2:])
428
return self.read_bytes(size)
430
self.abort(errors.MissingSection, required_for, section)
432
def _who_when(self, s, cmd, section):
433
"""Parse who and when information from a string.
435
:return: a tuple of (who,email,when) where who and
436
email are strings and when is a datetime object
438
match = _WHO_AND_WHEN_RE.search(s)
440
datestr = match.group(3)
441
if self.date_parser is None:
442
# auto-detect the date format
443
if len(datestr) == 16:
445
elif datestr == 'now':
449
self.date_parser = dates.DATE_PARSERS_BY_NAME[format]
450
when = self.date_parser(datestr)
451
return (match.group(1), match.group(2), when)
453
self.abort(errors.BadFormat, cmd, section, s)
457
# TODO: handle quoted paths
460
def _path_pair(self, s):
    """Split a string holding two space-separated paths.

    Only the first space is treated as the separator, so any later
    spaces stay in the second element. A string with no space yields
    a one-element tuple.

    :param s: the text holding both paths
    :return: a tuple of the pieces
    """
    # TODO: handle quoted paths
    pieces = s.split(' ', 1)
    return tuple(pieces)
466
"""Parse a file mode into executable and symlink flags.
468
:return: (is_executable, is_symlink)
470
# Note: Output from git-fast-export slightly different to spec
471
if s in ['644', '100644', '0100644']:
473
elif s in ['755', '100755', '0100755']:
478
self.abort(errors.BadFormat, 'filemodify', 'mode', s)