~onboard/onboard/trunk : revision 2226.1.3

1

#!/usr/bin/python3

2

# -*- coding: utf-8 -*-

3

4

5

#

6

# This file is part of Onboard.

7

#

8

# Onboard is free software; you can redistribute it and/or modify

9

# it under the terms of the GNU General Public License as published by

10

# the Free Software Foundation; either version 3 of the License, or

11

# (at your option) any later version.

12

#

13

# Onboard is distributed in the hope that it will be useful,

14

# but WITHOUT ANY WARRANTY; without even the implied warranty of

15

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

16

# GNU General Public License for more details.

17

#

18

# You should have received a copy of the GNU General Public License

19

# along with this program. If not, see <http://www.gnu.org/licenses/>.

20

21

22

import os

23

import sys

24

import locale

25

from bisect import bisect_left

26

from contextlib import contextmanager

27

from xml.dom import minidom

28

from urllib.request import urlopen

29

import re

30

from collections import OrderedDict

31

32

33

class UnicodeData:
    """
    Unicode code point tables, emoji properties and CLDR annotations,
    as needed to generate Onboard's emoji data.

    All data files are looked up below UNICODE_DATA_PATH, which is a
    relative path and therefore resolved against the current working
    directory.
    """

    UNICODE_DATA_PATH = "unicode_data"

    class CodePoint:
        """
        Class representing a single unicode code point.
        """
        # Code point value as integer.
        code = 0

        # Second field of a UnicodeData.txt line, English/untranslated.
        # NOTE(review): this was documented as General_category, but the
        # value gen_unicode_data() stores here is field 1, which holds the
        # character name (the doctests expect 'WOMAN') -- confirm intent.
        category = ""

        # Eleventh field of a UnicodeData.txt line, English/untranslated.
        # NOTE(review): was documented as iso_10646_comment_field; field 10
        # looks like the Unicode 1.0 name instead -- confirm intent.
        comment = ""

        def __init__(self):
            self.annotations = {}

        def __str__(self):
            parts = (self.code, self.category, self.comment)
            return 'UnicodeData({},{},{})'.format(*map(repr, parts))

    class Annotation:
        """ Annotation of a single entry in CLDR/common/annotations/*.xml """
        # The code sequence can consist of one or more code-points.
        # If there are multiple glyphs, they are apparently always joined
        # by \u200d, the "zero width joiner" character.
        code_sequence = ""

        annotations = ()     # tuple of keywords, translated
        annotation_tts = ""  # text-to-speech description, translated

        def __str__(self):
            parts = (self.code_sequence, self.annotations,
                     self.annotation_tts)
            return 'Annotation({},{},{})'.format(*map(repr, parts))

    def __init__(self):
        self._lang_ids = []

        # All code points, sorted by code point
        self._code_points = []        # list of code point objects
        self._code_points_index = []  # list of integers for faster bisect

        # {lang_id : {code_sequence : Annotation()}}
        self._annotations = {}

        # Emoji
        # emoji-default: those expected to have an emoji presentation by
        #                default, but can also have a text presentation
        # text-default:  those expected to have a text presentation by
        #                default, but could also have an emoji presentation
        # text-only:     those that should only have a text presentation
        self._emoji_default = set()
        self._text_default = set()

        # emoji that can be modified with a preceding modifier character
        self._emoji_with_modifier = set()

        # The code point tables are deliberately not loaded here;
        # call gen_unicode_data() explicitly (it may download files).

    def set_language_ids(self, lang_ids):
        """
        Set multiple lang_ids to be active at the same time.

        The annotations of all given lang_ids are (re-)loaded.
        """
        self._lang_ids = lang_ids
        self._load_annotations(self._lang_ids)

    def get_code_point(self, code):
        """
        Return the CodePoint object for the integer code point <code>,
        or None if it is not in the loaded tables.

        Doctests:
        >>> ud = UnicodeData()
        >>> cp = UnicodeData.CodePoint()
        >>> cp.code = 0x1F469
        >>> ud._code_points = [cp]
        >>> ud._code_points_index = [cp.code]
        >>> ud.get_code_point(0x1F469) is cp
        True
        >>> ud.get_code_point(0x41) is None
        True
        """
        # binary search in the sorted list of integer code points
        a = self._code_points_index
        i = bisect_left(a, code)
        if i != len(a) and a[i] == code:
            return self._code_points[i]
        return None

    def get_emojis_with_emoji_default(self):
        """
        Return an iterator over the code point objects of all emoji with
        default emoji presentation. Entries are None for emoji without
        matching entry in the code point table.
        """
        return (self.get_code_point(code) for code in self._emoji_default)

    def get_annotation_for_sequence(self, code_sequence, lang_id):
        """
        Return the Annotation object of <code_sequence> for the given
        lang_id, or None if there is none loaded.

        Doctests:
        >>> ud = UnicodeData()
        >>> a = UnicodeData.Annotation()
        >>> ud._annotations = {"en_US": {"x": a}}
        >>> ud.get_annotation_for_sequence("x", "en_US") is a
        True
        >>> ud.get_annotation_for_sequence("x", "de_DE") is None
        True
        """
        annotations = self._annotations.get(lang_id)
        if annotations:
            return annotations.get(code_sequence)
        return None

    def _load_annotations(self, lang_ids):
        """
        Replace all loaded annotations with those of the given lang_ids.

        For each lang_id the annotation files of its parent locales are
        read first, so more specific locales override their parents.
        Annotation files that don't exist are skipped.
        """
        self._annotations.clear()

        for lang_id in lang_ids:
            for base_name in self._get_cldr_locale_base_names(lang_id):
                path = self._cldr_path('common/annotations',
                                       base_name + ".xml")

                if os.path.exists(path):
                    # {code_sequence : Annotation()}
                    annotations = self._annotations.setdefault(
                        lang_id, OrderedDict())

                    self._load_annotation_file(path, annotations)

    def _load_annotation_file(self, path, annotations_out):
        """
        Read the CLDR annotation file at <path> and merge its entries
        into the dict annotations_out {code_sequence : Annotation()}.
        """
        with self._parse_xml(path) as dom:
            for node in dom.getElementsByTagName("annotation"):
                text = "".join(n.data for n in node.childNodes
                               if n.nodeType == n.TEXT_NODE)
                cp = self._get_attribute(node, 'cp', "-1")

                # keywords and tts description come as separate elements
                # with the same 'cp'; both end up in one Annotation
                a = annotations_out.setdefault(cp, self.Annotation())
                a.code_sequence = cp
                if self._get_attribute(node, 'type', "") == 'tts':
                    a.annotation_tts = text
                else:
                    a.annotations = tuple(s.strip()
                                          for s in text.split("|"))

    def _get_cldr_locale_base_names(self, lang_id):
        """
        Return the base names of the annotation files to load for
        lang_id, in root to child order.

        Doctests (these require the CLDR data files to be available):
        >>> ud = UnicodeData()
        >>> ud._get_cldr_locale_base_names('en_DE')
        ['en', 'en_001', 'en_150', 'en_DE']
        >>> ud._get_cldr_locale_base_names('de_DE')
        ['de', 'de_DE']
        """
        # {locale_id : parent_locale_id}
        parent_locales = {}
        path = self._cldr_path('common/supplemental', 'supplementalData.xml')
        with self._parse_xml(path) as dom:
            for node in dom.getElementsByTagName("parentLocale"):
                parent = node.attributes["parent"].value
                for lid in node.attributes["locales"].value.split():
                    parent_locales[lid] = parent

        # Find all annotation files we have to load for this
        # particular lang_id. There can be multiple parent locales,
        # e.g. en_DE -> en_150 -> en_001, then en.
        candidates = [lang_id]  # annotation files to load, root first
        lid = parent_locales.get(lang_id)
        # Stop at the root of the chain; also stop when a locale shows up
        # a second time, so broken (cyclic) data cannot loop forever.
        while lid and lid not in candidates:
            candidates.insert(0, lid)
            lid = parent_locales.get(lid)

        lang_code = self.split_lang_id(lang_id)[0]
        if lang_code not in candidates:
            candidates.insert(0, lang_code)

        return candidates

    @contextmanager
    def _parse_xml(self, path):
        """
        Context manager yielding the document element of the XML file
        at <path>. The element is released when the context is left.
        """
        with open(path, "r", encoding="UTF-8") as f:
            with minidom.parse(f).documentElement as dom:
                yield dom

    @staticmethod
    def _get_attribute(node, attribute, default):
        """ Value of the attribute of a DOM node, or default if missing. """
        attr = node.attributes.get(attribute)
        return attr.value if attr else default

    @staticmethod
    def split_lang_id(lang_id):
        """
        Split a lang_id like "en_US" into (lang_code, country_code).
        Missing parts are returned as empty strings.

        Doctests:
        >>> UnicodeData.split_lang_id("en_US")
        ('en', 'US')
        >>> UnicodeData.split_lang_id("de")
        ('de', '')
        """
        tokens = lang_id.split("_")
        lang_code = tokens[0]  # split() always returns at least one token
        country_code = tokens[1] if len(tokens) >= 2 else ""
        return lang_code, country_code

    @staticmethod
    def _iter_data_lines(lines):
        """
        Yield the non-empty lines of a Unicode data file, with '#'
        comments and surrounding white-space removed.
        """
        for line in lines:
            line = line.split("#")[0].strip()
            if line:
                yield line

    def gen_unicode_data(self):
        """
        Download UNICODE tables and generate data files to include in
        the project.

        Fills the code point table and the emoji sets of this object.
        Calling it again rebuilds them from scratch.

        Note: this is a build-time step, and even then, this has to
        be repeated only when updated unicode tables are released.

        Raises ValueError for UnicodeData.txt lines that don't have
        exactly 15 fields.
        """
        # block names, English
        # Only fetched to have the file in the local cache,
        # its contents aren't used yet.
        self._read_cached_http(
            'http://www.unicode.org/Public/UNIDATA/Blocks.txt',
            'UNIDATA', 'Blocks.txt')

        # code points
        lines = self._read_cached_http(
            'http://www.unicode.org/Public/UNIDATA/UnicodeData.txt',
            'UNIDATA', 'UnicodeData.txt')
        code_points = []
        for line in self._iter_data_lines(lines):
            fields = line.split(";")
            if len(fields) != 15:
                raise ValueError("unexpected number of fields ({}) in "
                                 "UnicodeData.txt line {}"
                                 .format(len(fields), repr(line)))

            cp = self.CodePoint()
            # The file has hexadecimal code points; get_code_point()
            # looks them up as integers.
            cp.code = int(fields[0], 16)
            cp.category = fields[1]
            cp.comment = fields[10]
            code_points.append(cp)

        # get_code_point() uses bisect, don't rely on the file's order
        code_points.sort(key=lambda cp: cp.code)
        self._code_points = code_points
        self._code_points_index = [cp.code for cp in code_points]

        # emoji-data.txt knows which characters are:
        # - emoji (Emoji),
        # - presentation emoji (Emoji_Presentation)
        # - emoji modifiers (Emoji_Modifier_Base).
        lines = self._read_cached_http(
            'http://unicode.org/Public/emoji/3.0/emoji-data.txt',
            'emoji', 'emoji-data.txt')
        emoji = set()
        emoji_presentation = set()
        emoji_modifier_base = set()
        flags = {
            "emoji": emoji,
            "emoji_presentation": emoji_presentation,
            "emoji_modifier_base": emoji_modifier_base,
        }
        for line in self._iter_data_lines(lines):
            fields = [c.strip() for c in line.split(";")]
            if len(fields) < 2:
                continue

            s = flags.get(fields[1].lower())
            if s is None:
                continue

            # either a single code point or a range "first..last"
            code_point_range = [int(e, 16) for e in fields[0].split("..")]
            if len(code_point_range) == 2:
                s.update(range(code_point_range[0],
                               code_point_range[1] + 1))
            elif len(code_point_range) == 1:
                s.add(code_point_range[0])

        # Emoji_Presentation marks the emoji with default emoji
        # presentation; all remaining emoji default to text presentation.
        self._emoji_default = emoji_presentation
        self._text_default = emoji - emoji_presentation
        self._emoji_with_modifier = emoji_modifier_base

    def _read_cached_http(self, url, subdir, fn):
        """
        Return the lines of the file at <url>, including line ends.
        The file is downloaded only if there is no cached copy yet.
        """
        path = self._get_http_file(url, subdir, fn)
        with open(path, "r", encoding="UTF-8") as f:
            return f.readlines()

    def _get_http_file(self, url, subdir, fn):
        """
        Return the path of the cached copy of <url>, downloading it
        to <subdir>/<fn> of the data directory if it doesn't exist yet.
        """
        dir_ = self._data_path(subdir)
        path = os.path.join(dir_, fn)

        os.makedirs(dir_, exist_ok=True)

        if not os.path.exists(path):
            # Progress goes to stderr, stdout is reserved for
            # the generated output.
            print("Downloading '{}'... ".format(url), file=sys.stderr)
            sys.stderr.flush()

            with urlopen(url) as response:
                data = response.read()
            text = data.decode('utf-8')
            with open(path, "w", encoding="UTF-8") as f:
                f.write(text)

            print(" saved as '{}'".format(path), file=sys.stderr)

        return path

    def _cldr_path(self, subdir, fn):
        """ path of file fn in subdir of the CLDR directory """
        cldr_subdir = os.path.join('CLDR', subdir)
        dir_ = self._data_path(cldr_subdir)
        return os.path.join(dir_, fn)

    def _data_path(self, fn):
        """ path of fn in the unicode_data directory """
        return os.path.join(self.UNICODE_DATA_PATH, fn)

    def gen_emoji_output(self):
        """
        Print the python source of the emoji_data table to stdout.

        The emoji are grouped into categories; a category starts at its
        starting code point in emoji order. Skin tone variations are
        listed as alternatives of their base emoji.
        A warning is written to stderr if categories were left unused.
        """
        # categories: [category label, starting codepoint, comment, found]
        categories = [
            ["😀", "😀", "Smileys", False],
            ["👦", "👦", "People", False],
            ["❤", "💋", "Emotion", False],
            ["🐱", "🐵", "Animals & Nature", False],
            ["🍒", "🍇", "Food & Drink", False],
            ["🏛", "🌍", "Travel & Places", False],
            ["⚽", "🎃", "Activities", False],
            ["🔔", "🔇", "Objects", False],
            ["🔶", "🏧", "Symbols", False],
            ["🏁", "🏁", "Flags", False],
        ]

        # {starting code point : index into categories}, first one wins
        category_starts = {}
        for i, category in enumerate(categories):
            category_starts.setdefault(ord(category[1]), i)

        print("#")
        print("# Generated for Onboard by " + os.path.basename(__file__))
        print("#")
        print()
        print("emoji_data = [")

        emoji_data = self._read_emoji_data().items()
        comment_row = 50
        category_open = False

        for codepoints, data in emoji_data:
            alternatives, comment = data

            # new category?
            new_category_index = -1
            if len(codepoints) == 1:
                new_category_index = category_starts.get(codepoints[0], -1)

            # category header
            if new_category_index >= 0:
                category = categories[new_category_index]
                category[3] = True  # mark as found, for later checks

                # close the previous category, if there was one
                if category_open:
                    print(" " * 8 + "]],")
                category_open = True

                clabel = category[0]
                ccomment = category[2]

                line = " " * 4 + "[" + repr(clabel) + ", "
                line = line.ljust(comment_row) + \
                    "# category: " + ccomment
                print(line)
                print(" " * 8 + "[")

            # main emoji
            line = " " * 12 + "(" + \
                repr("".join([chr(cp) for cp in codepoints])) + ", " + \
                ("None), " if not alternatives else "")
            line = line.ljust(comment_row) + "# " + comment
            print(line)

            # skin tones for the long-press popup
            for i, (acodepoints, acomment) in enumerate(alternatives):
                line = " " * 16 + \
                    ("(" if i == 0 else " ") + \
                    repr("".join([chr(cp) for cp in acodepoints])) + \
                    (")), " if i == len(alternatives) - 1 else ", ")
                line = line.ljust(comment_row) + "# " + acomment
                print(line)

        if category_open:
            print(" " * 8 + "]],")
        print("]")

        # plausibility check: have all categories been found?
        # Reported on stderr to keep the generated source valid.
        if not all(category[3] for category in categories):
            print(file=sys.stderr)
            print("Warning: emoji categories were not all used",
                  file=sys.stderr)
            for category in categories:
                print(" " * 4 + str(category), file=sys.stderr)

    def _read_emoji_data(self):
        """
        Read emoji-ordering.txt and return an OrderedDict
        {codepoints : [alternatives, comment]} in emoji order.

        codepoints is a tuple of integers, alternatives is a list of
        [codepoints, comment] with the skin tone variations that
        directly followed the emoji. Dropped entries are reported on
        stderr.
        """
        lines = self._read_cached_http(
            'http://unicode.org/emoji/charts/emoji-ordering.txt',
            'emoji', 'emoji-ordering.txt')

        skincolors = (0x1F3FB,
                      0x1F3FC,
                      0x1F3FD,
                      0x1F3FE,
                      0x1F3FF,
                      )

        last_noskin_cps = ()
        emoji_data = OrderedDict()

        for line in lines:
            # partition() leaves lines without '#' intact; the comment
            # is empty then
            data, _sep, comment = line.partition("#")
            data = data.strip()
            comment = comment.strip()
            if not data:
                continue

            fields = [c.strip() for c in data.split(";")]
            codepoints = tuple(int(c.strip().replace("U+", "0x"), 16)
                               for c in fields[0].split())

            # Emoji modified with skin color get hidden in
            # long-press popups.
            # The emoji of the popup is the immediate predecessor of
            # continuous skin color modifier sequences.
            noskincps = tuple(cp for cp in codepoints
                              if cp not in skincolors)
            if codepoints == noskincps:
                last_noskin_cps = codepoints
                emoji_data[codepoints] = [[], comment]

            elif not noskincps:  # just a skin modifier alone?
                print("dropping emoji: skin modifier", repr(comment),
                      file=sys.stderr)

            else:
                parent = emoji_data.get(last_noskin_cps)
                if parent:
                    parent[0].append([codepoints, comment])
                else:
                    print("dropping emoji: no parent for",
                          repr(comment), file=sys.stderr)

        return emoji_data

    def get_system_default_lang_id(self):
        """
        Return the lang_id of the system's default locale, or "en_US"
        if it cannot be determined.
        """
        # NOTE(review): locale.getdefaultlocale() is deprecated in recent
        # Python versions -- check the locale documentation before
        # upgrading the interpreter.
        lang_id = locale.getdefaultlocale()[0]
        if not lang_id:  # None e.g. with LANG=C
            lang_id = "en_US"
        return lang_id

477

478

479

if __name__ == "__main__":
    # Usage:
    #   <script>            print the generated emoji_data table to stdout
    #   <script> --doctest  run the doctests of this module instead
    #
    # The doctest run used to sit behind an unconditional exit() and was
    # never reached; it is available as an explicit option now. The
    # default invocation still only generates the emoji output.
    if "--doctest" in sys.argv[1:]:
        import doctest
        sys.exit(1 if doctest.testmod().failed else 0)

    ud = UnicodeData()
    ud.gen_emoji_output()

    # sys.exit() instead of the exit() helper, which is only injected
    # by the site module and not guaranteed to exist.
    sys.exit()

501

502