1
# Copyright (C) 2008 Canonical Ltd
3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License as published by
5
# the Free Software Foundation; either version 2 of the License, or
6
# (at your option) any later version.
8
# This program is distributed in the hope that it will be useful,
9
# but WITHOUT ANY WARRANTY; without even the implied warranty of
10
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
# GNU General Public License for more details.
13
# You should have received a copy of the GNU General Public License
14
# along with this program; if not, write to the Free Software
15
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17
"""Parser of import data into command objects.
19
In order to reuse existing front-ends, the stream format is a subset of
20
the one used by git-fast-import (as of the 1.5.4 release of git at least).
33
new_blob ::= 'blob' lf
36
file_content ::= data;
38
new_commit ::= 'commit' sp ref_str lf
40
('author' sp name '<' email '>' when lf)?
41
'committer' sp name '<' email '>' when lf
43
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
44
('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
49
file_change ::= file_clr
55
file_clr ::= 'deleteall' lf;
56
file_del ::= 'D' sp path_str lf;
57
file_rnm ::= 'R' sp path_str sp path_str lf;
58
file_cpy ::= 'C' sp path_str sp path_str lf;
59
file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
60
file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
63
new_tag ::= 'tag' sp tag_str lf
64
'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
65
'tagger' sp name '<' email '>' when lf
69
reset_branch ::= 'reset' sp ref_str lf
70
('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
73
checkpoint ::= 'checkpoint' lf
76
progress ::= 'progress' sp not_lf* lf
79
# note: the first idnum in a stream should be 1 and subsequent
80
# idnums should not have gaps between values as this will cause
81
# the stream parser to reserve space for the gapped values. An
82
# idnum can be updated in the future to a new object by issuing
83
# a new mark directive with the old idnum.
85
mark ::= 'mark' sp idnum lf;
86
data ::= (delimited_data | exact_data)
89
# note: delim may be any string but must not contain lf.
90
# data_line may contain any data but must not be exactly
91
# delim. The lf after the final data_line is included in
93
delimited_data ::= 'data' sp '<<' delim lf
97
# note: declen indicates the length of binary_data in bytes.
98
# declen does not include the lf preceding the binary data.
100
exact_data ::= 'data' sp declen lf
103
# note: quoted strings are C-style quoting supporting \c for
104
# common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
105
# is the signed byte value in octal. Note that the only
106
# characters which must actually be escaped to protect the
107
# stream formatting are: \, " and LF. Otherwise these values
111
sha1exp_str ::= sha1exp;
113
path_str ::= path | '"' quoted(path) '"' ;
114
mode ::= '100644' | '644'
119
declen ::= # unsigned 32 bit value, ascii base10 notation;
120
bigint ::= # unsigned integer value, ascii base10 notation;
121
binary_data ::= # file content, not interpreted;
123
when ::= raw_when | rfc2822_when;
124
raw_when ::= ts sp tz;
125
rfc2822_when ::= # Valid RFC 2822 date and time;
127
sp ::= # ASCII space character;
128
lf ::= # ASCII newline (LF) character;
130
# note: a colon (':') must precede the numerical value assigned to
131
# an idnum. This is to distinguish it from a ref or tag name as
132
# GIT does not permit ':' in ref or tag strings.
134
idnum ::= ':' bigint;
135
path ::= # GIT style file path, e.g. "a/b/c";
136
ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
137
tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
138
sha1exp ::= # Any valid GIT SHA1 expression;
139
hexsha1 ::= # SHA1 in hexadecimal format;
141
# note: name and email are UTF8 strings, however name must not
142
# contain '<' or lf and email must not contain any of the
143
# following: '<', '>', lf.
145
name ::= # valid GIT author/committer name;
146
email ::= # valid GIT author/committer email;
147
ts ::= # time since the epoch in seconds, ascii base10 notation;
148
tz ::= # GIT style timezone;
150
# note: comments may appear anywhere in the input, except
151
# within a data command. Any form of the data command
152
# always escapes the related input from comment processing.
154
# In case it is not clear, the '#' that starts the comment
155
# must be the first character on that line (an lf must have
158
comment ::= '#' not_lf* lf;
159
not_lf ::= # Any byte that is not ASCII newline (LF);
173
class LineBasedParser(object):
175
def __init__(self, input):
176
"""A Parser that keeps track of line numbers.
178
:param input: the file-like object to read from
182
# Lines pushed back onto the input stream
185
def abort(self, exception, *args):
    """Raise an exception providing line number information.

    :param exception: the exception class (or other callable) to raise;
        it is invoked as ``exception(self.lineno, *args)``
    :param args: extra positional arguments passed after the line number
    """
    raise exception(self.lineno, *args)
190
"""Get the next line including the newline or '' on EOF."""
193
return self._buffer.pop()
195
return self.input.readline()
198
"""Get the next line without the newline or None on EOF."""
199
line = self.readline()
205
def push_line(self, line):
206
"""Push line back onto the line buffer.
208
:param line: the line with no trailing newline
211
self._buffer.append(line + "\n")
213
def read_bytes(self, count):
214
"""Read a given number of bytes from the input stream.
216
Throws MissingBytes if the bytes are not found.
218
Note: This method does not read from the line buffer.
222
result = self.input.read(count)
224
self.lineno += result.count("\n")
226
self.abort(errors.MissingBytes, count, found)
229
def read_until(self, terminator):
230
"""Read the input stream until the terminator is found.
232
Throws MissingTerminator if the terminator is not found.
234
Note: This method does not read from the line buffer.
236
:return: the bytes read up to but excluding the terminator.
240
term = terminator + '\n'
242
line = self.input.readline()
247
return ''.join(lines)
250
# Regular expression used for parsing. (Note: The spec states that the name
251
# part should be non-empty but git-fast-export doesn't always do that so
252
# the first bit is \w*, not \w+.) Also git-fast-import code says the
253
# space before the email is optional.
254
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
255
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
258
class ImportParser(LineBasedParser):
260
def __init__(self, input, verbose=False, output=sys.stdout,
262
"""A Parser of import commands.
264
:param input: the file-like object to read from
265
:param verbose: display extra information or not
266
:param output: the file-like object to write messages to (YAGNI?)
267
:param user_mapper: if not None, the UserMapper used to adjust
268
user-ids for authors, committers and taggers.
270
LineBasedParser.__init__(self, input)
271
self.verbose = verbose
273
self.user_mapper = user_mapper
274
# We auto-detect the date format when a date is first encountered
275
self.date_parser = None
277
def warning(self, msg):
    """Write a warning, prefixed with the current line number, to stderr.

    :param msg: the text of the warning
    """
    sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))
280
def iter_commands(self):
281
"""Iterator returning ImportCommand objects."""
283
line = self.next_line()
286
elif len(line) == 0 or line.startswith('#'):
288
# Search for commands in order of likelihood
289
elif line.startswith('commit '):
290
yield self._parse_commit(line[len('commit '):])
291
elif line.startswith('blob'):
292
yield self._parse_blob()
293
elif line.startswith('progress '):
294
yield commands.ProgressCommand(line[len('progress '):])
295
elif line.startswith('reset '):
296
yield self._parse_reset(line[len('reset '):])
297
elif line.startswith('tag '):
298
yield self._parse_tag(line[len('tag '):])
299
elif line.startswith('checkpoint'):
300
yield commands.CheckpointCommand()
301
elif line.startswith('feature'):
302
yield self._parse_feature(line[len('feature '):])
304
self.abort(errors.InvalidCommand, line)
306
def iter_file_commands(self):
307
"""Iterator returning FileCommand objects.
309
If an invalid file command is found, the line is silently
310
pushed back and iteration ends.
313
line = self.next_line()
316
elif len(line) == 0 or line.startswith('#'):
318
# Search for file commands in order of likelihood
319
elif line.startswith('M '):
320
yield self._parse_file_modify(line[2:])
321
elif line.startswith('D '):
322
path = self._path(line[2:])
323
yield commands.FileDeleteCommand(path)
324
elif line.startswith('R '):
325
old, new = self._path_pair(line[2:])
326
yield commands.FileRenameCommand(old, new)
327
elif line.startswith('C '):
328
src, dest = self._path_pair(line[2:])
329
yield commands.FileCopyCommand(src, dest)
330
elif line.startswith('deleteall'):
331
yield commands.FileDeleteAllCommand()
336
def _parse_blob(self):
337
"""Parse a blob command."""
339
mark = self._get_mark_if_any()
340
data = self._get_data('blob')
341
return commands.BlobCommand(mark, data, lineno)
343
def _parse_commit(self, ref):
344
"""Parse a commit command."""
346
mark = self._get_mark_if_any()
347
author = self._get_user_info('commit', 'author', False)
350
another_author = self._get_user_info('commit', 'author', False)
351
if another_author is not None:
352
more_authors.append(another_author)
355
committer = self._get_user_info('commit', 'committer')
356
message = self._get_data('commit', 'message')
358
message = message.decode('utf_8')
359
except UnicodeDecodeError:
361
"commit message not in utf8 - replacing unknown characters")
362
message = message.decode('utf_8', 'replace')
363
from_ = self._get_from()
366
merge = self._get_merge()
367
if merge is not None:
368
# while the spec suggests it's illegal, git-fast-export
369
# outputs multiple merges on the one line, e.g.
371
these_merges = merge.split(" ")
372
merges.extend(these_merges)
377
name_value = self._get_property()
378
if name_value is not None:
379
name, value = name_value
380
properties[name] = value
383
return commands.CommitCommand(ref, mark, author, committer, message,
384
from_, merges, self.iter_file_commands, lineno=lineno,
385
more_authors=more_authors, properties=properties)
387
def _parse_feature(self, info):
388
"""Parse a feature command."""
389
parts = info.split("=", 1)
392
value = self._path(parts[1])
395
return commands.FeatureCommand(name, value, lineno=self.lineno)
397
def _parse_file_modify(self, info):
398
"""Parse a filemodify command within a commit.
400
:param info: a string in the format "mode dataref path"
401
(where dataref might be the hard-coded literal 'inline').
403
params = info.split(' ', 2)
404
path = self._path(params[2])
405
is_executable, kind = self._mode(params[0])
406
if params[1] == 'inline':
408
data = self._get_data('filemodify')
412
return commands.FileModifyCommand(path, kind, is_executable, dataref,
415
def _parse_reset(self, ref):
    """Parse a reset command.

    :param ref: the text that followed 'reset ' on the command line
    :return: a ResetCommand built from ref and the 'from' section, if any
    """
    # No required_for argument is passed, unlike _parse_tag: the grammar
    # marks the 'from' line of a reset as optional.
    from_ = self._get_from()
    return commands.ResetCommand(ref, from_)
420
def _parse_tag(self, name):
    """Parse a tag command.

    :param name: the text that followed 'tag ' on the command line
    :return: a TagCommand built from the name, the 'from' reference, the
        tagger information and the decoded message
    """
    from_ = self._get_from('tag')
    tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
    raw_message = self._get_data('tag', 'message')
    try:
        message = raw_message.decode('utf_8')
    except UnicodeDecodeError:
        # Same policy _who_when applies to names and emails: warn and
        # substitute undecodable bytes rather than abort the import.
        self.warning(
            "tag message not in utf8 - replacing unknown characters")
        message = raw_message.decode('utf_8', 'replace')
    return commands.TagCommand(name, from_, tagger, message)
427
def _get_mark_if_any(self):
428
"""Parse a mark section."""
429
line = self.next_line()
430
if line.startswith('mark :'):
431
return line[len('mark :'):]
436
def _get_from(self, required_for=None):
437
"""Parse a from section."""
438
line = self.next_line()
441
elif line.startswith('from '):
442
return line[len('from '):]
444
self.abort(errors.MissingSection, required_for, 'from')
449
def _get_merge(self):
450
"""Parse a merge section."""
451
line = self.next_line()
454
elif line.startswith('merge '):
455
return line[len('merge '):]
460
def _get_property(self):
461
"""Parse a property section."""
462
line = self.next_line()
465
elif line.startswith('property '):
466
return self._name_value(line[len('property '):])
471
def _get_user_info(self, cmd, section, required=True,
472
accept_just_who=False):
473
"""Parse a user section."""
474
line = self.next_line()
475
if line.startswith(section + ' '):
476
return self._who_when(line[len(section + ' '):], cmd, section,
477
accept_just_who=accept_just_who)
479
self.abort(errors.MissingSection, cmd, section)
484
def _get_data(self, required_for, section='data'):
485
"""Parse a data section."""
486
line = self.next_line()
487
if line.startswith('data '):
488
rest = line[len('data '):]
489
if rest.startswith('<<'):
490
return self.read_until(rest[2:])
493
read_bytes = self.read_bytes(size)
494
# optional LF after data.
495
next = self.input.readline()
497
if len(next) > 1 or next != "\n":
498
self.push_line(next[:-1])
501
self.abort(errors.MissingSection, required_for, section)
503
def _who_when(self, s, cmd, section, accept_just_who=False):
504
"""Parse who and when information from a string.
506
:return: a tuple of (name,email,timestamp,timezone). name may be
507
the empty string if only an email address was given.
509
match = _WHO_AND_WHEN_RE.search(s)
511
datestr = match.group(3).lstrip()
512
if self.date_parser is None:
513
# auto-detect the date format
514
if len(datestr.split(' ')) == 2:
516
elif datestr == 'now':
520
self.date_parser = dates.DATE_PARSERS_BY_NAME[format]
522
when = self.date_parser(datestr, self.lineno)
524
print "failed to parse datestr '%s'" % (datestr,)
527
match = _WHO_RE.search(s)
528
if accept_just_who and match:
529
# HACK around missing time
530
# TODO: output a warning here
531
when = dates.DATE_PARSERS_BY_NAME['now']('now')
533
self.abort(errors.BadFormat, cmd, section, s)
534
name = match.group(1)
538
name = name[:-1].decode('utf_8')
539
except UnicodeDecodeError:
540
# The spec says names are *typically* utf8 encoded
541
# but that isn't enforced by git-fast-export (at least)
542
self.warning("%s name not in utf8 - replacing unknown "
543
"characters" % (section,))
544
name = name[:-1].decode('utf_8', 'replace')
545
email = match.group(2)
546
# While it shouldn't happen, some datasets have email addresses
547
# which contain unicode characters. See bug 338186. We sanitize
548
# the data at this level just in case.
550
email = email.decode('utf_8')
551
except UnicodeDecodeError:
552
self.warning("%s email not in utf8 - replacing unknown characters"
554
email = email.decode('utf_8', 'replace')
556
name, email = self.user_mapper.map_name_and_email(name, email)
557
return (name, email, when[0], when[1])
559
def _name_value(self, s):
560
"""Parse a (name,value) tuple from 'name value-length value'."""
561
parts = s.split(' ', 2)
568
still_to_read = size - len(value)
569
if still_to_read > 0:
570
read_bytes = self.read_bytes(still_to_read)
571
value += "\n" + read_bytes[:still_to_read - 1]
572
value = value.decode('utf8')
577
if s.startswith('"'):
579
self.abort(errors.BadFormat, '?', '?', s)
581
return _unquote_c_string(s[1:-1])
583
return s.decode('utf_8')
584
except UnicodeDecodeError:
585
# The spec recommends utf8 encoding but that isn't enforced
588
def _path_pair(self, s):
589
"""Parse two paths separated by a space."""
590
# TODO: handle a space in the first path
591
if s.startswith('"'):
592
parts = s[1:].split('" ', 1)
594
parts = s.split(' ', 1)
596
self.abort(errors.BadFormat, '?', '?', s)
597
elif parts[1].startswith('"') and parts[1].endswith('"'):
598
parts[1] = parts[1][1:-1]
599
elif parts[1].startswith('"') or parts[1].endswith('"'):
600
self.abort(errors.BadFormat, '?', '?', s)
601
return map(_unquote_c_string, parts)
604
"""Parse a file mode into executable and kind.
606
:return: (is_executable, kind)
608
# Note: Output from git-fast-export slightly different to spec
609
if s in ['644', '100644', '0100644']:
610
return False, commands.FILE_KIND
611
elif s in ['755', '100755', '0100755']:
612
return True, commands.FILE_KIND
613
elif s in ['040000', '0040000']:
614
return False, commands.DIRECTORY_KIND
615
elif s in ['120000', '0120000']:
616
return False, commands.SYMLINK_KIND
617
elif s in ['160000', '0160000']:
618
return False, commands.TREE_REFERENCE_KIND
620
self.abort(errors.BadFormat, 'filemodify', 'mode', s)
623
def _unquote_c_string(s):
624
"""replace C-style escape sequences (\n, \", etc.) with real chars."""
625
# HACK: Python strings are close enough
626
return s.decode('string_escape', 'replace')