2
# Originally written by Barry Warsaw <barry@zope.com>
4
# Minimally patched to make it even more xgettext compatible
5
# by Peter Funk <pf@artcom-gmbh.de>
7
"""pygettext -- Python equivalent of xgettext(1)
9
Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
10
internationalization of C programs. Most of these tools are independent of
11
the programming language and can be used from within Python programs. Martin
12
von Loewis' work[1] helps considerably in this regard.
14
There's one problem though; xgettext is the program that scans source code
15
looking for message strings, but it groks only C (or C++). Python introduces
16
a few wrinkles, such as dual quoting characters, triple quoted strings, and
17
raw strings. xgettext understands none of this.
19
Enter pygettext, which uses Python's standard tokenize module to scan Python
20
source code, generating .pot files identical to what GNU xgettext[2] generates
21
for C and C++ code. From there, the standard GNU tools can be used.
23
A word about marking Python strings as candidates for translation. GNU
24
xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
25
gettext_noop. But those can be a lot of text to include all over your code.
26
C and C++ have a trick: they use the C preprocessor. Most internationalized C
27
source includes a #define for gettext() to _() so that what has to be written
28
in the source is much less. Thus these are both translatable strings:
30
gettext("Translatable String")
31
_("Translatable String")
33
Python of course has no preprocessor so this doesn't work so well. Thus,
34
pygettext searches only for _() by default, but see the -k/--keyword flag
35
below for how to augment this.
37
[1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
38
[2] http://www.gnu.org/software/gettext/gettext.html
40
NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
41
wherever possible. However, some options are still missing or are not fully
42
implemented. Also, xgettext's use of command line switches with option
43
arguments is broken, and in these cases, pygettext just defines additional
46
Usage: pygettext [options] inputfile ...
56
Rename the default output file from messages.pot to name.pot.
60
Replace non-ASCII characters with octal escape sequences.
64
Extract module, class, method, and function docstrings. These do not
65
need to be wrapped in _() markers, and in fact cannot be for Python to
66
consider them docstrings. (See also the -X option).
70
Print this help message and exit.
74
Keywords to look for in addition to the default set, which are:
77
You can have multiple -k flags on the command line.
81
Disable the default set of keywords (see above). Any keywords
82
explicitly added with the -k/--keyword option are still recognized.
85
Do not write filename/lineno location comments.
89
Write filename/lineno location comments indicating where each
90
extracted string is found in the source. These lines appear before
91
each msgid. The style of comments is controlled by the -S/--style
92
option. This is the default.
96
Rename the default output file from messages.pot to filename. If
97
filename is `-' then the output is sent to standard out.
101
Output files will be placed in directory dir.
105
Specify which style to use for location comments. Two styles are
108
Solaris # File: filename, line: line-number
111
The style name is case insensitive. GNU style is the default.
115
Print the names of the files being processed.
119
Print the version of pygettext and exit.
123
Set width of output to columns.
126
--exclude-file=filename
127
Specify a file that contains a list of strings that are not to be
128
extracted from the input files. Each string to be excluded must
129
appear on a line by itself in the file.
132
--no-docstrings=filename
133
Specify a file that contains a list of files (one per line) that
134
should not have their docstrings extracted. This is only useful in
135
conjunction with the -D option above.
137
If `inputfile' is -, standard input is read.
156
# Keyword names treated as translation markers unless the defaults are
# disabled on the command line (-K rebinds this list to []).
default_keywords = ['_']
# Comma-separated, human-readable form of the default keyword list.
# NOTE(review): presumably interpolated into the usage text via
# `% globals()` -- confirm against the full module docstring.
DEFAULTKEYWORDS = ', '.join(default_keywords)
163
# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
166
# SOME DESCRIPTIVE TITLE.
167
# Copyright (C) YEAR ORGANIZATION
168
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
172
"Project-Id-Version: PACKAGE VERSION\\n"
173
"POT-Creation-Date: %(time)s\\n"
174
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
175
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
176
"Language-Team: LANGUAGE <LL@li.org>\\n"
177
"MIME-Version: 1.0\\n"
178
"Content-Type: text/plain; charset=CHARSET\\n"
179
"Content-Transfer-Encoding: ENCODING\\n"
180
"Generated-By: pygettext.py %(version)s\\n"
185
def usage(code, msg=''):
186
print >> sys.stderr, _(__doc__) % globals()
188
print >> sys.stderr, msg
195
def make_escapes(pass_iso8859):
198
# Allow iso-8859 characters to pass through so that e.g. 'msgid
199
# "Höhe"' would not result in 'msgid "H\366he"'. Otherwise we
200
# escape any character outside the 32..126 range.
205
if 32 <= (i % mod) <= 126:
206
escapes.append(chr(i))
208
escapes.append("\\%03o" % i)
209
escapes[ord('\\')] = '\\\\'
210
escapes[ord('\t')] = '\\t'
211
escapes[ord('\r')] = '\\r'
212
escapes[ord('\n')] = '\\n'
213
escapes[ord('\"')] = '\\"'
219
for i in range(len(s)):
220
s[i] = escapes[ord(s[i])]
221
return EMPTYSTRING.join(s)
225
# unwrap quotes, safely
226
return eval(s, {'__builtins__':{}}, {})
230
# This converts the various Python string types into a format that is
231
# appropriate for .po files, namely much closer to C style.
232
lines = s.split('\n')
234
s = '"' + escape(s) + '"'
238
lines[-1] = lines[-1] + '\n'
239
for i in range(len(lines)):
240
lines[i] = escape(lines[i])
242
s = '""\n"' + lineterm.join(lines) + '"'
248
def __init__(self, options):
249
self.__options = options
251
self.__state = self.__waiting
254
self.__freshmodule = 1
255
self.__curfile = None
257
def __call__(self, ttype, tstring, stup, etup, line):
260
## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
261
## 'tstring:', tstring
262
self.__state(ttype, tstring, stup[0])
264
def __waiting(self, ttype, tstring, lineno):
265
opts = self.__options
266
# Do docstring extractions, if enabled
267
if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
269
if self.__freshmodule:
270
if ttype == tokenize.STRING:
271
self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
272
self.__freshmodule = 0
273
elif ttype not in (tokenize.COMMENT, tokenize.NL):
274
self.__freshmodule = 0
277
if ttype == tokenize.NAME and tstring in ('class', 'def'):
278
self.__state = self.__suiteseen
280
if ttype == tokenize.NAME and tstring in opts.keywords:
281
self.__state = self.__keywordseen
283
def __suiteseen(self, ttype, tstring, lineno):
    """State handler: ignore tokens until the colon that opens a suite.

    This handler becomes the current state after a ``class`` or ``def``
    name token has been seen, and is then invoked once per token.

    ttype   -- token type constant from the tokenize module
    tstring -- text of the token
    lineno  -- line number of the token (not used by this handler)

    Returns None.  On a ':' operator token the current state is switched
    to the suite-docstring handler; every other token is ignored.
    """
    # NOTE(review): any ':' operator token matches, so a colon inside the
    # header itself (e.g. in a dict-literal default value) would also end
    # the skip -- confirm whether that case needs handling.
    if ttype == tokenize.OP and tstring == ':':
        self.__state = self.__suitedocstring
288
def __suitedocstring(self, ttype, tstring, lineno):
289
# ignore any intervening noise
290
if ttype == tokenize.STRING:
291
self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
292
self.__state = self.__waiting
293
elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
295
# there was no class docstring
296
self.__state = self.__waiting
298
def __keywordseen(self, ttype, tstring, lineno):
299
if ttype == tokenize.OP and tstring == '(':
301
self.__lineno = lineno
302
self.__state = self.__openseen
304
self.__state = self.__waiting
306
def __openseen(self, ttype, tstring, lineno):
307
if ttype == tokenize.OP and tstring == ')':
308
# We've seen the last of the translatable strings. Record the
309
# line number of the first line of the strings and update the list
310
# of messages seen. Reset state for the next batch. If there
311
# were no strings inside _(), then just ignore this entry.
313
self.__addentry(EMPTYSTRING.join(self.__data))
314
self.__state = self.__waiting
315
elif ttype == tokenize.STRING:
316
self.__data.append(safe_eval(tstring))
317
# TBD: should we warn if we see anything else?
319
def __addentry(self, msg, lineno=None, isdocstring=0):
321
lineno = self.__lineno
322
if not msg in self.__options.toexclude:
323
entry = (self.__curfile, lineno)
324
self.__messages.setdefault(msg, {})[entry] = isdocstring
326
def set_filename(self, filename):
    """Record the file about to be scanned and reset per-file state.

    filename -- name stored as the current file; it is used as the file
                part of the (file, lineno) location of each entry added
                afterwards.

    Returns None.
    """
    self.__curfile = filename
    # A new file starts fresh: its first string token may be the module
    # docstring, so re-arm the module-docstring check.
    self.__freshmodule = 1
331
options = self.__options
332
timestamp = time.ctime(time.time())
333
# The time stamp in the header doesn't have the same format as that
334
# generated by xgettext...
335
print >> fp, pot_header % {'time': timestamp, 'version': __version__}
336
# Sort the entries. First sort each particular entry's keys, then
337
# sort all the entries by their first item.
339
for k, v in self.__messages.items():
342
reverse.setdefault(tuple(keys), []).append((k, v))
343
rkeys = reverse.keys()
346
rentries = reverse[rkey]
348
for k, v in rentries:
350
# If the entry was gleaned out of a docstring, then add a
351
# comment stating so. This is to aid translators who may wish
352
# to skip translating some unimportant docstrings.
353
if reduce(operator.__add__, v.values()):
355
# k is the message string, v is a dictionary-set of (filename,
356
# lineno) tuples. We want to sort the entries in v first by
357
# file name and then by line number.
360
if not options.writelocations:
362
# location comments are different b/w Solaris and GNU:
363
elif options.locationstyle == options.SOLARIS:
364
for filename, lineno in v:
365
d = {'filename': filename, 'lineno': lineno}
367
'# File: %(filename)s, line: %(lineno)d') % d
368
elif options.locationstyle == options.GNU:
369
# fit as many locations on one line, as long as the
370
# resulting line length doesn't exceed 'options.width'
372
for filename, lineno in v:
373
d = {'filename': filename, 'lineno': lineno}
374
s = _(' %(filename)s:%(lineno)d') % d
375
if len(locline) + len(s) <= options.width:
376
locline = locline + s
383
print >> fp, '#, docstring'
384
print >> fp, 'msgid', normalize(k)
385
print >> fp, 'msgstr ""\n'
390
global default_keywords
392
opts, args = getopt.getopt(
394
'ad:DEhk:Kno:p:S:Vvw:x:X:',
395
['extract-all', 'default-domain=', 'escape', 'help',
396
'keyword=', 'no-default-keywords',
397
'add-location', 'no-location', 'output=', 'output-dir=',
398
'style=', 'verbose', 'version', 'width=', 'exclude-file=',
399
'docstrings', 'no-docstrings',
401
except getopt.error, msg:
404
# for holding option values
410
extractall = 0 # FIXME: currently this option has no effect at all.
414
outfile = 'messages.pot'
424
locations = {'gnu' : options.GNU,
425
'solaris' : options.SOLARIS,
429
for opt, arg in opts:
430
if opt in ('-h', '--help'):
432
elif opt in ('-a', '--extract-all'):
433
options.extractall = 1
434
elif opt in ('-d', '--default-domain'):
435
options.outfile = arg + '.pot'
436
elif opt in ('-E', '--escape'):
438
elif opt in ('-D', '--docstrings'):
439
options.docstrings = 1
440
elif opt in ('-k', '--keyword'):
441
options.keywords.append(arg)
442
elif opt in ('-K', '--no-default-keywords'):
443
default_keywords = []
444
elif opt in ('-n', '--add-location'):
445
options.writelocations = 1
446
elif opt in ('--no-location',):
447
options.writelocations = 0
448
elif opt in ('-S', '--style'):
449
options.locationstyle = locations.get(arg.lower())
450
if options.locationstyle is None:
451
usage(1, _('Invalid value for --style: %s') % arg)
452
elif opt in ('-o', '--output'):
453
options.outfile = arg
454
elif opt in ('-p', '--output-dir'):
455
options.outpath = arg
456
elif opt in ('-v', '--verbose'):
458
elif opt in ('-V', '--version'):
459
print _('pygettext.py (xgettext for Python) %s') % __version__
461
elif opt in ('-w', '--width'):
463
options.width = int(arg)
465
usage(1, _('--width argument must be an integer: %s') % arg)
466
elif opt in ('-x', '--exclude-file'):
467
options.excludefilename = arg
468
elif opt in ('-X', '--no-docstrings'):
475
options.nodocstrings[line[:-1]] = 1
480
make_escapes(options.escape)
482
# calculate all keywords
483
options.keywords.extend(default_keywords)
485
# initialize list of strings to exclude
486
if options.excludefilename:
488
fp = open(options.excludefilename)
489
options.toexclude = fp.readlines()
492
print >> sys.stderr, _(
493
"Can't read --exclude-file: %s") % options.excludefilename
496
options.toexclude = []
498
# slurp through all the files
499
eater = TokenEater(options)
500
for filename in args:
503
print _('Reading standard input')
508
print _('Working on %s') % filename
512
eater.set_filename(filename)
514
tokenize.tokenize(fp.readline, eater)
515
except tokenize.TokenError, e:
516
print >> sys.stderr, '%s: %s, line %d, column %d' % (
517
e[0], filename, e[1][0], e[1][1])
523
if options.outfile == '-':
528
options.outfile = os.path.join(options.outpath, options.outfile)
529
fp = open(options.outfile, 'w')
538
if __name__ == '__main__':
540
# some more test strings
541
_(u'a unicode string')