# -*- Mode: Python; coding: utf-8; indent-tabs-mode: nil; tab-width: 4 -*-
# Copyright (C) 2011 David Planella <david.planella@ubuntu.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 3, as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranties of
# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
# PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This module implements a class to extract translatable messages from
# different types of files and put them into a Gettext POT file ready to give
# to translators to do their work. At this point only extracting messages
# from HTML files has been implemented.

import codecs
import mimetypes
import os
import sys
from datetime import datetime
from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc

from translate_html import translate_htmlconfig

# polib is a third-party library: when it cannot be imported, installation
# instructions are written to stderr.
try:
    import polib
except ImportError:
    # NOTE(review): the ``try`` / ``import polib`` / ``except`` lines were
    # missing from the reviewed listing and have been reconstructed around
    # the surviving message. Nothing visible stops the script here, so a
    # later use of ``polib`` would still fail with NameError -- confirm
    # whether an explicit exit is wanted.
    sys.stderr.write('You need the Python Polib library to run this ' +
                     'script.\nYou can install it by running:\n\t' +
                     '$ sudo apt-get install python-polib')

# MIME type definitions (type, encoding); getExtractor() compares them with
# the pair returned by mimetypes.guess_type()
HTML_FILE = ('text/html', None)
JS_FILE = ('application/javascript', None)

# Byte order mark character; HTMLStringParser.handle_data() ignores a text
# chunk that consists only of it.
# NOTE(review): the definition of BOM was not visible in the reviewed listing
# although handle_data() compares against it -- the value below is assumed,
# confirm against the original file.
BOM = u'\ufeff'

# Settings taken from the project configuration module
PO_FOLDER = translate_htmlconfig.PO_FOLDER
GETTEXT_DOMAIN = translate_htmlconfig.GETTEXT_DOMAIN
POTFILES = translate_htmlconfig.POTFILES


class HTMLStringParser(HTMLParser):
    """This class does the actual extraction of messages from HTML files.

    Feed it an HTML document and then call text() to obtain a Python set
    containing the extracted text. Text found inside a skipped element is
    ignored. HTML entities are generally not supported, the only exception
    being ``&amp;``, which is joined to the text surrounding it.

    NOTE(review): the reviewed listing had lost its indentation and all of
    its short lines (the ``__init__`` and ``text`` headers, ``else:``, the
    first branch of the tag checks, ...). Those lines are reconstructed from
    the way the surviving lines use them -- confirm against the original.
    """

    # Elements whose text content is not extracted.
    # NOTE(review): only 'noscript' survived in the listing; 'script' as the
    # other skipped tag is an assumption.
    _SKIPPED_TAGS = ('script', 'noscript')

    def __init__(self):
        HTMLParser.__init__(self)
        # Extracted text fragments, in the order they were found
        self._text = []
        # True right after an '&amp;' entity has been stored, so that the
        # next text chunk is merged with it
        self.entityseen = False
        # True while the parser is inside one of the skipped elements
        self.skiptag = False

    def handle_data(self, data):
        """Store a chunk of text, merging it with a pending entity if any."""
        text = data.strip()
        if not text or self.skiptag or text == BOM:
            return

        # Collapse every run of whitespace into a single space
        text = sub('[ \t\r\n]+', ' ', text)
        if not self.entityseen:
            self._text.append(text)
            return

        # The last stored item is the entity: glue "<previous> <entity>
        # <text>" back into a single message.
        entity = self._text.pop()
        if self._text:
            self._text[-1] += ' ' + entity + ' ' + text
        else:
            # The entity was the very first item, so there is no previous
            # text to extend (indexing [-1] here used to raise IndexError).
            self._text.append(entity + ' ' + text)
        self.entityseen = False

    def handle_starttag(self, tag, attrs):
        """Start ignoring text when a skipped element is opened."""
        if tag in self._SKIPPED_TAGS:
            self.skiptag = True

    def handle_endtag(self, tag):
        """Resume extracting text when a skipped element is closed."""
        if tag in self._SKIPPED_TAGS:
            self.skiptag = False

    def handle_entityref(self, name):
        """Store an ``&amp;`` reference; other entities are ignored."""
        # We only support &amp; for now. Inside a skipped element nothing is
        # stored, so the flag must not be raised either: otherwise the next
        # text chunk would pop an unrelated message.
        if name == 'amp' and not self.skiptag:
            self.handle_data('&' + name + ';')
            self.entityseen = True

    def text(self):
        """Return the extracted text as a set (duplicates are dropped)."""
        return set(self._text)


class StringExtractor(object):
    """This class reads the list of files to extract strings from from the
    POTFILES.in file, performs the extraction and saves the POT file to disk.

    NOTE(review): the reviewed listing had lost its indentation and all of
    its short lines; the method headers ``__init__`` and ``extract`` and the
    other lines marked below are reconstructed -- confirm against the
    original file.
    """

    def __init__(self):
        self.files = self._load_files()
        self.potfile = polib.POFile()
        # The same timestamp is used for the creation and the revision date
        time_str = datetime.now().isoformat(' ')
        self.potfile.metadata = {
            'Project-Id-Version': '1.0',
            'Report-Msgid-Bugs-To': 'you@example.com',
            'POT-Creation-Date': time_str,
            'PO-Revision-Date': time_str,
            'Last-Translator': 'you <you@example.com>',
            'Language-Team': 'English <yourteam@example.com>',
            'MIME-Version': '1.0',
            'Content-Type': 'text/plain; charset=utf-8',
            'Content-Transfer-Encoding': '8bit',
        }

    def _load_files(self):
        """Loads the files to extract strings from. They are expected to
        be listed in the POTFILES.in file, one per line. Blank lines and
        lines starting with '#' are skipped.

        Returns the list of paths, each one joined to the sources path.
        """
        sources_path = translate_htmlconfig.get_sources_path()
        file_list = []
        # NOTE(review): the second argument of get_source_file() was on a
        # line missing from the listing; POTFILES is assumed.
        with open(translate_htmlconfig.get_source_file(PO_FOLDER,
                                                       POTFILES)) as fp:
            for line in fp:
                # Strip before testing, so that blank lines do not end up
                # in the list as a bare directory path
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                file_list.append(os.path.join(sources_path, line))
        return file_list

    def _save_potfile(self):
        """Writes the resulting POT file to disk"""
        # NOTE(review): the middle path component was on a line missing from
        # the listing; PO_FOLDER is assumed.
        self.potfile.save(os.path.join(
            translate_htmlconfig.get_sources_path(),
            PO_FOLDER,
            GETTEXT_DOMAIN + '.pot'))

    def extract(self):
        """Extracts the messages from the given files by choosing the
        appropriate extractor type, and saves the POT file to disk"""
        for file_to_extract in self.files:
            extractor = getExtractor(self.potfile, file_to_extract)
            # NOTE(review): reconstructed call; the extractors are assumed
            # to expose an extract() method.
            extractor.extract()

        self._save_potfile()


class StringExtractorJs(object):
    """This class implements the extractor of messages from JavaScript files.
    It is currently not supported.

    Every line starting with ``var`` that contains an assignment yields one
    entry in the given POT file.

    NOTE(review): the reviewed listing had lost its indentation and all of
    its short lines; the ``extract`` header, ``self.jsfile``, the line
    counter and the ``msgid`` argument are reconstructed -- confirm against
    the original file.
    """

    def __init__(self, potfile, jsfile):
        self.jsfile = jsfile
        self.potfile = potfile

    @staticmethod
    def _message_from_line(line):
        """Return the message assigned on a ``var`` line, or None."""
        if not line.startswith('var'):
            return None
        _, separator, message = line.partition('=')
        if not separator:
            # A declaration without assignment (``var x;``) has no message;
            # unpacking the result of split('=', 1) used to raise ValueError.
            return None
        # Drop the first character and the last two of the stripped value
        # (presumably the opening quote and the closing quote plus ';').
        return message.strip()[1:-2]

    def extract(self):
        """Add the messages found in the JavaScript file to the POT file."""
        # Occurrences are recorded with the sources path replaced by '..'
        jsfile_rel = self.jsfile.replace(
            translate_htmlconfig.get_sources_path(), '..')
        with codecs.open(self.jsfile, 'r', 'utf-8') as fp:
            for linecount, line in enumerate(fp.readlines(), 1):
                message = self._message_from_line(line)
                if message is None:
                    continue

                entry = polib.POEntry(
                    msgid=message,
                    occurrences=[(jsfile_rel, linecount)],
                    msgstr=u'')
                self.potfile.append(entry)


class StringExtractorHtml(object):
    """This class implements the extractor of messages from HTML files.
    It reads the given HTML file and puts the extracted messages in a
    POT file.

    NOTE(review): the reviewed listing had lost its indentation and all of
    its short lines; the ``extract`` header, the ``try``/``except`` lines and
    the ``msgid`` argument are reconstructed -- confirm against the original
    file.
    """

    def __init__(self, potfile, htmlfile):
        self.htmlfile = htmlfile
        self.potfile = potfile

    def extract(self):
        """Add the messages found in the HTML file to the POT file.

        Errors are not propagated: the traceback is printed to stderr.
        """
        # Occurrences are recorded with the sources path replaced by '..'
        htmlfile_rel = self.htmlfile.replace(
            translate_htmlconfig.get_sources_path(), '..')
        try:
            with codecs.open(self.htmlfile, 'r', 'utf-8') as fp:
                html_file = fp.read()
            extractor = HTMLStringParser()
            extractor.feed(html_file)

            messages = extractor.text()

            # text() returns a set: sort it so that the entries are written
            # in the same order on every run
            for message in sorted(messages):
                entry = polib.POEntry(
                    occurrences=[(htmlfile_rel, 0)],
                    msgid=message,
                    msgstr=u'')
                self.potfile.append(entry)
        except Exception:
            print_exc(file=stderr)


class StringExtractorNone(object):
    """Dummy message extractor

    getExtractor() returns it for files that are neither HTML nor
    JavaScript.

    NOTE(review): only the class header, the first docstring line and the
    ``__init__`` header survived in the reviewed listing; the empty bodies
    and the ``extract`` method are reconstructed -- confirm against the
    original file.
    """

    def __init__(self, potfile, path):
        pass

    def extract(self):
        """Do nothing."""
        pass


def getExtractor(potfile, path):
    """Factory-like function to guess the type of file to extract translations
    from by its MIME type, and return the appropriate extractor to handle it.

    potfile is the POT file object handed over to the extractor, path is the
    file to extract messages from. Returns a StringExtractorHtml, a
    StringExtractorJs or, for any other type, a StringExtractorNone instance.
    """
    # Guess the type of the given file
    filetype, encoding = mimetypes.guess_type(path)

    # Return the appropriate extractor to handle the type
    if (filetype, encoding) == HTML_FILE:
        return StringExtractorHtml(potfile, path)
    elif (filetype, encoding) == JS_FILE:
        return StringExtractorJs(potfile, path)
    else:
        return StringExtractorNone(potfile, path)