2
# -*- coding: utf-8 -*-
4
# Copyright © 2017 marmuta <marmvta@gmail.com>
6
# This file is part of Onboard.
8
# Onboard is free software; you can redistribute it and/or modify
9
# it under the terms of the GNU General Public License as published by
10
# the Free Software Foundation; either version 3 of the License, or
11
# (at your option) any later version.
13
# Onboard is distributed in the hope that it will be useful,
14
# but WITHOUT ANY WARRANTY; without even the implied warranty of
15
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
# GNU General Public License for more details.
18
# You should have received a copy of the GNU General Public License
19
# along with this program. If not, see <http://www.gnu.org/licenses/>.
25
from bisect import bisect_left
26
from contextlib import contextmanager
27
from xml.dom import minidom
28
from urllib.request import urlopen
30
from collections import OrderedDict
35
UNICODE_DATA_PATH = "unicode_data"
39
Class representing a single unicode code point.
43
# General_category of UnicodeData.txt, English/untranslated
46
# iso_10646_comment_field of UnicodeData.txt, English/untranslated
53
return ('UnicodeData({},{},{})'
54
.format(repr(self.code),
60
""" Annotation of a single entry in CLDR/common/annotations/*.xml """
61
# The code sequence can consist of one or more code-points.
62
# If there are multiple glyphs, they are apparently always joined
63
# by \u200d, the "zero width joiner" character.
66
annotations = () # tuple of keywords, translated
67
annotation_tts = "" # text-to-speech description, translated
70
return ('Annotation({},{},{})'
71
.format(repr(self.code_sequence),
72
repr(self.annotations),
73
repr(self.annotation_tts)))
78
# All code points, sorted by code point
79
self._code_points = [] # list of code point objects
80
self._code_points_index = [] # list of integers for faster bisect
82
self._annotations = {}
85
# emoji-default: those expected to have an emoji presentation by
86
# default, but can also have a text presentation
87
# text-default: those expected to have a text presentation by default,
88
# but could also have an emoji presentation
89
# text-only: those that should only have a text presentation
90
self._emoji_default = set()
91
self._text_default = set()
93
# emoji that can be modified with a preceding modifier character
94
self._emoji_with_modifier = set()
96
# self.gen_unicode_data()
98
def set_language_ids(self, lang_ids):
    """
    Set multiple lang_ids to be active at the same time.

    Remembers the ids and (re-)loads the CLDR annotations for all
    of them.
    """
    self._lang_ids = lang_ids
    self._load_annotations(self._lang_ids)
105
def get_code_point(self, code):
    """
    Return the code point object for the integer code point `code`,
    or None if it is unknown.

    Binary-searches the parallel, sorted integer index list
    self._code_points_index for speed.
    """
    a = self._code_points_index
    i = bisect_left(a, code)
    if i != len(a) and a[i] == code:
        return self._code_points[i]
    # explicit for clarity; the tail of the original body was lost
    return None
120
def get_emojis_with_emoji_default(self):
    """ Iterate over the code point objects of all characters that
        default to an emoji presentation. """
    return map(self.get_code_point, self._emoji_default)
123
def get_annotation_for_sequence(self, code_sequence, lang_id):
    """
    Return the Annotation for `code_sequence` in language `lang_id`,
    or None if there is none.

    Annotations for `lang_id` must have been loaded before with
    set_language_ids().
    """
    annotations = self._annotations.get(lang_id)
    if annotations is None:
        # No annotations loaded for this language; without this guard
        # the .get() below would raise AttributeError on None.
        return None
    return annotations.get(code_sequence)
135
def _load_annotations(self, lang_ids):
    """
    Discard previously loaded annotations and (re-)load the CLDR
    annotation files for all `lang_ids`, parent locales before child
    locales so child entries override inherited ones.
    """
    self._annotations.clear()

    for lang_id in lang_ids:
        for base_name in self._get_cldr_locale_base_names(lang_id):
            # NOTE(review): file name reconstructed from the damaged
            # source; CLDR annotation files are named "<locale>.xml".
            path = self._cldr_path('common/annotations',
                                   base_name + ".xml")
            if os.path.exists(path):
                # {code_sequence : Annotation()}
                annotations = self._annotations.setdefault(lang_id, {})
                self._load_annotation_file(path, annotations)
150
def _load_annotation_file(self, path, annotations_out):
    """
    Parse one CLDR annotations file and merge its entries into
    `annotations_out`, a dict {code_sequence : Annotation()}.
    """
    with self._parse_xml(path) as dom:
        for node in dom.getElementsByTagName("annotation"):
            # concatenate all text children of the <annotation> node
            text = "".join([n.data for n in node.childNodes
                            if n.nodeType == n.TEXT_NODE])
            cp = self._get_attribute(node, 'cp', "-1")

            a = annotations_out.setdefault(cp, self.Annotation())
            # NOTE(review): the damaged source may have also assigned
            # a.code_sequence = cp here — verify against upstream.

            # type="tts" entries carry the text-to-speech description;
            # all other entries a "|"-separated keyword list.
            if self._get_attribute(node, 'type', "") == 'tts':
                a.annotation_tts = text
            else:
                a.annotations = tuple(s.strip()
                                      for s in text.split("|"))
165
def _get_cldr_locale_base_names(self, lang_id):
    """
    Return the locale base names of all annotation files to load for
    `lang_id`, in root to child order.

    >>> ud = UnicodeData()
    >>> ud._get_cldr_locale_base_names('en_DE')
    ['en', 'en_001', 'en_150', 'en_DE']
    """
    # Read the child-locale -> parent-locale map from CLDR's
    # supplemental data.
    parent_locales = {}
    path = self._cldr_path('common/supplemental', 'supplementalData.xml')
    with self._parse_xml(path) as dom:
        for node in dom.getElementsByTagName("parentLocale"):
            parent = node.attributes["parent"].value
            locales = node.attributes["locales"].value
            locale_ids = locales.split()
            for lid in locale_ids:
                parent_locales[lid] = parent

    # Find all annotation files we have to load for this
    # particular lang_id. There can be multiple parent locales,
    # e.g. en_DE -> en_150 -> en_001, then en.
    candidates = []  # annotations files to load, in root to child order
    candidates.append(lang_id)
    lid = lang_id
    while True:
        lid = parent_locales.get(lid)
        if lid is None:
            # NOTE(review): CLDR maps some locales to the parent
            # "root"; the lost original lines may have filtered that
            # out as well — verify against upstream.
            break
        candidates.insert(0, lid)

    # finally fall back to the plain language code, e.g. 'en'
    lang_code, country_code = self.split_lang_id(lang_id)
    if lang_code not in candidates:
        candidates.insert(0, lang_code)

    return candidates
203
@contextmanager
def _parse_xml(self, path):
    """
    Context manager yielding the document element of the XML file at
    `path`.

    Call sites use `with self._parse_xml(path) as dom:`, so this must
    be a generator decorated with @contextmanager — both the decorator
    and the yield were lost in the damaged source. The inner `with`
    unlink()s the minidom tree on exit to free it promptly.
    """
    with open(path, "r", encoding="UTF-8") as f:
        with minidom.parse(f).documentElement as dom:
            yield dom
209
@staticmethod
def _get_attribute(node, attribute, default):
    """
    Return the value of `attribute` of DOM `node`, or `default` if
    the attribute doesn't exist.

    Call sites invoke this as self._get_attribute(node, attr, default)
    with three arguments, so it must be a @staticmethod — the
    decorator was lost in the damaged source.
    """
    attr = node.attributes.get(attribute)
    return attr.value if attr else default
214
@staticmethod
def split_lang_id(lang_id):
    """
    Split a locale id like 'en_US' into ('en', 'US').

    Missing components become "", e.g. 'de' -> ('de', '').
    Call sites invoke this as self.split_lang_id(lang_id) with a
    single argument, so it must be a @staticmethod — the decorator
    was lost in the damaged source.
    """
    tokens = lang_id.split("_")
    lang_code = tokens[0] if len(tokens) >= 1 else ""
    country_code = tokens[1] if len(tokens) >= 2 else ""
    return lang_code, country_code
220
def gen_unicode_data(self):
222
Download UNICODE tables and generate data files to include in
225
Note: this is a build-time step, and even then, this has to
226
be repeated only when updated unicode tables are released.
228
# block names, English
229
lines = self._read_cached_http(
230
'http://www.unicode.org/Public/UNIDATA/Blocks.txt',
231
'UNIDATA', 'Blocks.txt')
234
line = line.split("#")[0].strip()
237
lines = self._read_cached_http(
238
'http://www.unicode.org/Public/UNIDATA/UnicodeData.txt',
239
'UNIDATA', 'UnicodeData.txt')
242
line = line.split("#")[0].strip()
244
fields = line.split(";")
247
canonical_Combining_classes,
248
bidirectional_category,
249
character_decomposition_mapping,
255
iso_10646_comment_field,
262
cp = self.CodePoint()
264
cp.category = general_category
265
cp.comment = iso_10646_comment_field
266
self._code_points.append(cp)
267
self._code_points_index.append(code_value)
269
# emoji-data.txt knows which characters are:
271
# - presentation emoji (Emoji_Presentation)
272
# - emoji modifiers (Emoji_Modifier_Base).
273
lines = self._read_cached_http(
274
'http://unicode.org/Public/emoji/3.0/emoji-data.txt',
275
'emoji', 'emoji-data.txt')
278
line = line.split("#")[0].strip()
280
fields = [c.strip() for c in line.split(";")]
282
code_point_range = re.split('\.\.', fields[0])
283
code_point_range = list(int(e, 16)
284
for e in code_point_range)
285
flag = fields[1].lower()
288
if flag == "Emoji".lower():
289
s = self._emoji_default
290
elif flag == "Emoji_Presentation".lower():
291
s = self._text_default
292
elif flag == "Emoji_Modifier_Base".lower():
293
s = self._emoji_with_modifier
296
if len(code_point_range) == 2:
297
for i in range(code_point_range[0],
298
code_point_range[1] + 1):
300
elif len(code_point_range) == 1:
301
s.add(code_point_range[0])
303
def _read_cached_http(self, url, subdir, fn):
    """
    Return the lines of the (possibly cached) download of `url`,
    stored as file `fn` in data subdirectory `subdir`.
    """
    path = self._get_http_file(url, subdir, fn)
    with open(path, "r", encoding="UTF-8") as f:
        lines = f.readlines()
    # callers iterate the result; the return was lost in the
    # damaged source
    return lines
310
def _get_http_file(self, url, subdir, fn):
    """
    Return the path of the local copy of `url`, downloading it to
    data subdirectory `subdir` as `fn` on first use.

    NOTE(review): the directory-creation, file-write and return
    statements were lost in the damaged source and are reconstructed
    here — verify against upstream.
    """
    dir_ = self._data_path(subdir)
    path = os.path.join(dir_, fn)

    if not os.path.exists(dir_):
        os.makedirs(dir_)

    if not os.path.exists(path):
        print("Downloading '{}'... ".format(url))
        response = urlopen(url)
        data = response.read()
        text = data.decode('utf-8')
        with open(path, "w", encoding="UTF-8") as f:
            f.write(text)
        print(" saved as '{}'".format(path))

    return path
332
def _cldr_path(self, subdir, fn):
    """ Return the path of file `fn` below the CLDR data directory. """
    cldr_dir = self._data_path(os.path.join('CLDR', subdir))
    return os.path.join(cldr_dir, fn)
338
def _data_path(self, fn):
    """ Return the path of file `fn` inside the unicode_data directory. """
    base = self.UNICODE_DATA_PATH
    return os.path.join(base, fn)
342
def gen_emoji_output(self):
343
# categories: [category label, starting codepoint, comment]
345
["😀", "😀", "Smileys", False],
346
["👦", "👦", "People", False],
347
["❤", "💋", "Emotion", False],
348
["🐱", "🐵", "Animals & Nature", False],
349
["🍒", "🍇", "Food & Drink", False],
350
["🏛", "🌍", "Travel & Places", False],
351
["⚽", "🎃", "Activities", False],
352
["🔔", "🔇", "Objects", False],
353
["🔶", "🏧", "Symbols", False],
354
["🏁", "🏁", "Flags", False],
358
print("# Generated for Onboard by " + os.path.basename(__file__))
361
print("emoji_data = [")
363
emoji_data = self._read_emoji_data().items()
366
for codepoints, data in emoji_data:
367
alternatives, comment = data
370
new_category_index = -1
371
for i, category in enumerate(categories):
372
if len(codepoints) == 1 and \
373
codepoints[0] == ord(category[1]):
374
new_category_index = i
375
category[3] = True # mark as found, for later checks
379
if new_category_index > 0:
380
print(" " * 8 + "]],")
382
if new_category_index >= 0:
383
category = categories[new_category_index]
385
ccomment = category[2]
387
line = " " * 4 + "[" + repr(clabel) + ", "
388
line = line.ljust(comment_row) + \
389
"# category: " + ccomment
393
new_category_index = -1
396
line = " " * 12 + "(" + \
397
repr("".join([chr(cp) for cp in codepoints])) + ", " + \
398
("None), " if not alternatives else "")
399
line = line.ljust(comment_row) + "# " + comment
402
# skin tones for the long-press popup
403
for i, (acodepoints, acomment) in enumerate(alternatives):
405
("(" if i == 0 else " ") +\
406
repr("".join([chr(cp) for cp in acodepoints])) + \
407
(")), " if i == len(alternatives) - 1 else ", ")
408
line = line.ljust(comment_row) + "# " + acomment
411
print(" " * 8 + "]],")
414
# plausibility check: have all categories been found?
415
if not all(category[3] for category in categories):
417
print("Warning: emoji categories were not all used")
418
for category in categories:
419
print(" " * 4 + str(category))
421
def _read_emoji_data(self):
422
lines = self._read_cached_http(
423
'http://unicode.org/emoji/charts/emoji-ordering.txt',
424
'emoji', 'emoji-ordering.txt')
426
skincolors = [0x1F3FB,
434
emoji_data = OrderedDict()
439
data = line[:k].strip()
440
comment = line[k + 1:].strip()
442
fields = [c.strip() for c in data.split(";")]
443
codepoints = tuple(int(c.strip().replace("U+", "0x"), 16)
444
for c in fields[0].split())
446
# Emoji modified with skin color get hidden in
448
# The emoji of the popup is the immediate predecessor of
449
# continues skin color modifier sequences.
450
noskincps = tuple(cp for cp in codepoints
451
if cp not in skincolors)
452
if codepoints == noskincps:
453
last_noskin_cps = codepoints
454
emoji_data[codepoints] = [[], comment]
456
elif not noskincps: # just a skin modifier alone?
457
print("dropping emoji: skin modifier", repr(comment),
461
# print(codepoints, comment, last_noskin_comment)
462
parent = emoji_data.get(last_noskin_cps)
464
parent[0].append([codepoints, comment])
466
print("dropping emoji: no parent for",
467
repr(comment), file=sys.stderr)
472
def get_system_default_lang_id(self):
473
lang_id = locale.getdefaultlocale()[0]
474
if not lang_id: # None e.g. with LANG=C
479
if __name__ == "__main__":
481
ud.gen_emoji_output()
485
# Download stuff now (first time), so the output doesn't
486
# interfere with doctests.
493
ud.set_language_ids(["en_US"])
496
cps = ud.get_emoji_sequences()
497
cps = ud.get_emojis_with_emoji_default()
499
print(cp.code, cp.category, cp.comment,
500
cp.annotations, cp.annotation_tts)