2
* this file is ported from kdelibs/kdeui/kcharselectdata.cpp
4
* original file is licensed under GPLv2+
10
#include <fcitx-utils/uthash.h>
11
#include <fcitx-utils/utils.h>
12
#include <fcitx-config/xdg.h>
13
#include <fcitx/fcitx.h>
14
#if defined(__linux__) || defined(__GLIBC__)
17
#include <sys/endian.h>
19
#include "charselectdata.h"
21
/* constants for hangul (de)composition, see UAX #15 */
29
/* Derived Hangul counts: NCount = VCount * TCount, SCount = LCount * NCount.
 * LCount, VCount and TCount are defined on lines not shown in this view. */
#define NCount (VCount * TCount)
30
#define SCount (LCount * NCount)
31
/* uthash wrappers for tables keyed by a 16-bit code point: both use the
 * hash handle member `hh` and a key length of sizeof(uint16_t). */
#define HASH_FIND_UNICODE(head,findint,out) \
32
HASH_FIND(hh,head,findint,sizeof(uint16_t),out)
33
#define HASH_ADD_UNICODE(head,intfield,add) \
34
HASH_ADD(hh,head,intfield,sizeof(uint16_t),add)
36
/* Hash-set entry for one code point. The member list is not visible in this
 * view; other code in this file accesses `unicode` and the uthash handle `hh`. */
typedef struct _UnicodeSet {
41
/* utarray element descriptor used for the code-point arrays in this file:
 * element size is sizeof(int16_t) and the remaining three members are NULL. */
static const UT_icd int16_icd = { sizeof(int16_t), NULL, NULL, NULL };
43
/* Name fragments indexed by LIndex when CharSelectDataName() builds a
 * "HANGUL SYLLABLE ..." name. Each entry fits in 4 bytes including the NUL. */
static const char JAMO_L_TABLE[][4] = {
44
"G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
45
"S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
48
/* Name fragments indexed by VIndex when CharSelectDataName() builds a
 * "HANGUL SYLLABLE ..." name. Only part of the initializer is visible in
 * this view. */
static const char JAMO_V_TABLE[][4] = {
49
"A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
50
"WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
54
/* Name fragments indexed by TIndex when CharSelectDataName() builds a
 * "HANGUL SYLLABLE ..." name; index 0 is the empty string. */
static const char JAMO_T_TABLE[][4] = {
55
"", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
56
"LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
57
"S", "SS", "NG", "J", "C", "K", "T", "P", "H"
60
/*
 * Ordering callback for UnicodeSet entries (passed to HASH_SORT in this file).
 * Returns the difference of the two entries' `unicode` fields, i.e. a
 * negative, zero or positive value for ascending code-point order.
 */
int uni_cmp(const void* a, const void* b) {
61
const UnicodeSet* sa = a;
62
const UnicodeSet* sb = b;
63
return sa->unicode - sb->unicode;
66
/*
 * Ordering callback for an array whose elements are CharSelectDataIndex
 * pointers (passed to utarray_sort in this file). `a` and `b` point at the
 * array elements, so each is dereferenced once to reach the index entry.
 * Keys are compared case-insensitively with strcasecmp().
 */
int pindex_cmp(const void* a, const void* b) {
67
CharSelectDataIndex* const* pa = a;
68
CharSelectDataIndex* const* pb = b;
70
return strcasecmp((*pa)->key, (*pb)->key);
73
/*
 * Search callback passed to utarray_custom_bsearch(): compares a search
 * string against the key of the index entry that `b` points to, ignoring
 * case. The declaration of `s` is on a line not visible in this view
 * (presumably derived from `a` -- confirm).
 */
int index_search_cmp(const void* a, const void* b) {
75
CharSelectDataIndex* const* pb = b;
77
return strcasecmp(s, (*pb)->key);
80
/*
 * Second search callback passed to utarray_custom_bsearch(): compares only
 * the first `len` characters of the search string against the entry's key,
 * ignoring case. The declarations of `s`, `len` and `res`, and whatever is
 * done with `res` afterwards, are on lines not visible in this view.
 */
int index_search_a_cmp(const void* a, const void* b) {
82
CharSelectDataIndex* const* pb = b;
86
res = strncasecmp(s, (*pb)->key, len);
93
/* Forward declarations for helpers that are defined further down in this file. */
UT_array* SplitString(const char* s);
95
char* FormatCode(uint16_t code, int length, const char* prefix);
96
UnicodeSet* CharSelectDataGetMatchingChars(CharSelectData* charselect, const char* s);
98
/*
 * Reads a 32-bit value from the byte buffer `d`. The bytes are copied into a
 * local `t` with memcpy() rather than read through a cast pointer. The
 * declaration of `t`, the byte-order conversion implied by the function name
 * and the return statement are on lines not visible in this view.
 */
uint32_t FromLittleEndian32(const char* d)
100
const uint8_t* data = (const uint8_t*) d;
102
memcpy(&t, data, sizeof(t));
106
/*
 * Reads a 16-bit value from the byte buffer `d`. The bytes are copied into a
 * local `t` with memcpy() rather than read through a cast pointer. The
 * declaration of `t`, the byte-order conversion implied by the function name
 * and the return statement are on lines not visible in this view.
 */
uint16_t FromLittleEndian16(const char* d)
108
const uint8_t* data = (const uint8_t*) d;
110
memcpy(&t, data, sizeof(t));
114
/*
 * Allocates a CharSelectData, loads the whole "charselectdata" file (looked
 * up under the "data" prefix) into charselect->dataFile, records its length
 * in charselect->size, and then builds the search index.
 *
 * NOTE(review): no check of `fp`, of the ftell() result, or of the fread()
 * return value appears in the visible lines; confirm against the full file.
 */
CharSelectData* CharSelectDataCreate()
116
CharSelectData* charselect = fcitx_utils_new(CharSelectData);
120
FILE* fp = FcitxXDGGetFileWithPrefix("data", "charselectdata", "r", NULL);
124
// determine the file length by seeking to the end and back
fseek(fp, 0, SEEK_END);
125
long int size = ftell(fp);
126
fseek(fp, 0, SEEK_SET);
128
charselect->size = size;
129
charselect->dataFile = fcitx_utils_malloc0(size);
130
fread(charselect->dataFile, 1, size, fp);
134
CharSelectDataCreateIndex(charselect);
143
/*
 * Looks up `unicode` in the Unihan table of the data file and collects its
 * seven string fields into the list `res`.
 *
 * The table starts at the offset stored at byte 36 of the file and runs to
 * the end of the file. Records are 30 bytes: a 16-bit code point followed by
 * seven 32-bit offsets. The table is searched by bisection on the code
 * point. For each of the seven fields either the string at that offset or
 * an empty string is pushed; the condition that chooses between them is on
 * a line not visible in this view.
 */
UT_array* CharSelectDataUnihanInfo(CharSelectData* charselect, uint16_t unicode)
145
UT_array* res = fcitx_utils_new_string_list();
147
const char* data = charselect->dataFile;
148
const uint32_t offsetBegin = FromLittleEndian32(data+36);
149
const uint32_t offsetEnd = charselect->size;
153
// index of the last 30-byte record
int max = ((offsetEnd - offsetBegin) / 30) - 1;
156
mid = (min + max) / 2;
157
const uint16_t midUnicode = FromLittleEndian16(data + offsetBegin + mid*30);
158
if (unicode > midUnicode)
160
else if (unicode < midUnicode)
164
for(i = 0; i < 7; i++) {
165
// field i's offset sits after the 2-byte code point
uint32_t offset = FromLittleEndian32(data + offsetBegin + mid*30 + 2 + i*4);
166
const char* empty = "";
168
const char* r = data + offset;
169
utarray_push_back(res, &r);
171
utarray_push_back(res, &empty);
181
/*
 * Finds the details record for `unicode` and returns its byte offset within
 * the data file; the result is set to 0 when the search does not find one.
 *
 * The details table spans the offsets stored at bytes 12 and 16 of the file
 * and consists of 27-byte records that begin with a 16-bit code point. It
 * is searched by bisection.
 *
 * The most recent query and its result are cached in function-local static
 * variables, so repeated lookups of the same code point skip the search.
 * NOTE(review): the statics are shared by all callers and start out as 0,
 * so a first query for code point 0 would return the cached 0 -- confirm
 * that this is intended.
 */
uint32_t CharSelectDataGetDetailIndex(CharSelectData* charselect, uint16_t unicode)
183
const char* data = charselect->dataFile;
184
// Convert from little-endian, so that this code works on PPC too.
185
// http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286
186
const uint32_t offsetBegin = FromLittleEndian32(data+12);
187
const uint32_t offsetEnd = FromLittleEndian32(data+16);
191
// index of the last 27-byte record
int max = ((offsetEnd - offsetBegin) / 27) - 1;
193
static uint16_t most_recent_searched;
194
static uint32_t most_recent_result;
197
if (unicode == most_recent_searched)
198
return most_recent_result;
200
most_recent_searched = unicode;
203
mid = (min + max) / 2;
204
const uint16_t midUnicode = FromLittleEndian16(data + offsetBegin + mid*27);
205
if (unicode > midUnicode)
207
else if (unicode < midUnicode)
210
most_recent_result = offsetBegin + mid*27;
212
return most_recent_result;
216
most_recent_result = 0;
220
/*
 * Produces a newly allocated name for `unicode` (every visible branch
 * assigns `result` from asprintf, fcitx_utils_alloc_cat_str or strdup).
 *
 *  - 0x3400..0x4DB5 and 0x4E00..0x9FA5: "CJK UNIFIED IDEOGRAPH-<hex>".
 *  - 0xAC00..0xD7AF: "HANGUL SYLLABLE " plus the L/V/T jamo fragments
 *    obtained by decomposing the syllable index.
 *  - Surrogate and private-use ranges: fixed, translated placeholders.
 *  - Otherwise: bisection over the names table (offsets stored at bytes 4
 *    and 8 of the data file, 6-byte records of a 16-bit code point plus a
 *    32-bit string offset). "<not assigned>" is used as a fallback.
 *
 * NOTE(review): "%x" prints lowercase hex digits, and the asprintf() return
 * value is not checked in the visible lines -- confirm both are intended.
 */
char* CharSelectDataName(CharSelectData* charselect, uint16_t unicode)
224
if ((unicode >= 0x3400 && unicode <= 0x4DB5)
225
|| (unicode >= 0x4e00 && unicode <= 0x9fa5)) {
226
// || (unicode >= 0x20000 && unicode <= 0x2A6D6) // useless, since limited to 16 bit
227
asprintf(&result, "CJK UNIFIED IDEOGRAPH-%x", unicode);
228
} else if (unicode >= 0xac00 && unicode <= 0xd7af) {
229
/* compute hangul syllable name as per UAX #15 */
230
int SIndex = unicode - SBase;
231
int LIndex, VIndex, TIndex;
233
if (SIndex < 0 || SIndex >= SCount) {
238
// split the syllable index into leading / vowel / trailing jamo indices
LIndex = SIndex / NCount;
239
VIndex = (SIndex % NCount) / TCount;
240
TIndex = SIndex % TCount;
242
fcitx_utils_alloc_cat_str(result, "HANGUL SYLLABLE ",
243
JAMO_L_TABLE[LIndex],
244
JAMO_V_TABLE[VIndex],
245
JAMO_T_TABLE[TIndex]);
246
} else if (unicode >= 0xD800 && unicode <= 0xDB7F)
247
result = strdup(_("<Non Private Use High Surrogate>"));
248
else if (unicode >= 0xDB80 && unicode <= 0xDBFF)
249
result = strdup(_("<Private Use High Surrogate>"));
250
else if (unicode >= 0xDC00 && unicode <= 0xDFFF)
251
result = strdup(_("<Low Surrogate>"));
252
else if (unicode >= 0xE000 && unicode <= 0xF8FF)
253
result = strdup(_("<Private Use>"));
256
const char* data = charselect->dataFile;
257
const uint32_t offsetBegin = FromLittleEndian32(data+4);
258
const uint32_t offsetEnd = FromLittleEndian32(data+8);
262
// index of the last 6-byte record
int max = ((offsetEnd - offsetBegin) / 6) - 1;
265
mid = (min + max) / 2;
266
const uint16_t midUnicode = FromLittleEndian16(data + offsetBegin + mid*6);
267
if (unicode > midUnicode)
269
else if (unicode < midUnicode)
272
uint32_t offset = FromLittleEndian32(data + offsetBegin + mid*6 + 2);
273
// the name text starts one byte past the stored offset
result = strdup(charselect->dataFile + offset + 1);
281
result = strdup(_("<not assigned>"));
286
/*
 * Starts from a heap copy of `src` made with strdup(). The rest of the body
 * is not visible in this view, so the transformation applied to the copy is
 * not documented here.
 */
char* Simplified(const char* src)
288
char* s = strdup(src);
312
/*
 * Tests whether `s` looks like a prefixed hexadecimal code. The visible
 * check requires the string to start with "0x", "0X", "u+" or "U+". How the
 * length `l` and the remaining characters are validated, and the return
 * values, are on lines not visible in this view.
 */
int IsHexString(const char* s)
314
size_t l = strlen(s);
317
if (!((s[0] == '0' && s[1] == 'x')
318
|| (s[0] == '0' && s[1] == 'X')
319
|| (s[0] == 'u' && s[1] == '+')
320
|| (s[0] == 'U' && s[1] == '+'))) {
333
/* Releases a UnicodeSet hash. The body is not visible in this view. */
void UnicodeSetFree(UnicodeSet* set) {
341
/*
 * Combines two UnicodeSet hashes. The visible lines walk the entries of
 * `left`, look each code point up in `right`, remove matched entries from
 * `right`, and release `left` and/or `right` with UnicodeSetFree(). The
 * surrounding conditions and the return statements are not visible in this
 * view -- confirm ownership rules against the full file.
 */
UnicodeSet* UnicodeSetIntersect(UnicodeSet* left, UnicodeSet* right)
350
UnicodeSet* p = left;
352
UnicodeSet* find = NULL;
353
HASH_FIND_UNICODE(right, &p->unicode, find);
354
// fetch the successor here, next to the lookup for the current entry
UnicodeSet* next = p->hh.next;
360
HASH_DEL(right, find);
367
UnicodeSetFree(right);
372
UnicodeSetFree(left);
375
UnicodeSetFree(right);
380
/*
 * Searches for characters matching `needle` and collects their code points
 * in `returnRes`, an array of 16-bit elements.
 *
 * Visible steps:
 *  1. Simplify the needle and split it into search words. A one-character
 *     needle is replaced by the "U+XXXX" form of that character.
 *  2. For each word: a hex-prefixed word is parsed as a code point and pushed
 *     directly; otherwise the word is tried as a decimal number.
 *  3. Each word is looked up via CharSelectDataGetMatchingChars(), and the
 *     per-word sets are combined with UnicodeSetIntersect().
 *  4. Code points already pushed in step 2 are removed from the set, then
 *     the set is sorted by code point and appended.
 *
 * NOTE(review): strtol() always stores a non-NULL pointer in `end`, so the
 * `end == NULL` test below looks like it can never be true (possibly meant
 * `*end == '\0'`) -- confirm.
 * NOTE(review): `unicode` is an int but `returnRes` elements are
 * sizeof(int16_t) bytes, so which bytes get copied depends on byte order.
 * NOTE(review): `simplified[0]` is a plain char passed as uint16_t; a
 * negative char value would be converted to a large code -- confirm.
 */
UT_array* CharSelectDataFind(CharSelectData* charselect, const char* needle)
382
UnicodeSet *result = NULL;
385
utarray_new(returnRes, &int16_icd);
386
char* simplified = Simplified(needle);
387
UT_array* searchStrings = SplitString(simplified);
389
if (strlen(simplified) == 1) {
390
// search for hex representation of the character
391
utarray_clear(searchStrings);
392
char* format = FormatCode(simplified[0], 4, "U+");
393
utarray_push_back(searchStrings, &format);
398
if (utarray_len(searchStrings) == 0) {
402
utarray_foreach(s, searchStrings, char*) {
404
if(IsHexString(*s)) {
406
// skip the two-character prefix before parsing the hex digits
uint16_t uni = (uint16_t) strtol(*s + 2, &end, 16);
407
utarray_push_back(returnRes, &uni);
409
// search for "1234" instead of "0x1234"
410
char* news = strdup(*s + 2);
414
// try to parse string as decimal number
416
int unicode = strtol(*s, &end, 10);
417
if (end == NULL && unicode >= 0 && unicode <= 0xFFFF) {
418
utarray_push_back(returnRes, &unicode);
422
int firstSubString = 1;
423
utarray_foreach(s2, searchStrings, char* ) {
424
UnicodeSet* partResult = CharSelectDataGetMatchingChars(charselect, *s2);
425
if (firstSubString) {
429
result = UnicodeSetIntersect(result, partResult);
435
// remove results found by matching the code point to prevent duplicate results
436
// while letting these characters stay at the beginning
437
utarray_foreach(c, returnRes, uint16_t) {
438
UnicodeSet* dup = NULL;
439
HASH_FIND_UNICODE(result, c, dup);
441
HASH_DEL(result, dup);
444
HASH_SORT(result, uni_cmp);
447
UnicodeSet* p = result;
449
utarray_push_back(returnRes, &p->unicode);
453
utarray_free(searchStrings);
458
/*
 * Adds `unicode` to the hash `set`. The code point is looked up first; the
 * visible lines then allocate a new entry and add it, presumably only when
 * the lookup found nothing (the guarding condition and the return statement
 * are not visible in this view).
 */
UnicodeSet* InsertResult(UnicodeSet* set, uint16_t unicode) {
459
UnicodeSet* find = NULL;
460
HASH_FIND_UNICODE(set, &unicode, find);
462
find = fcitx_utils_new(UnicodeSet);
463
find->unicode = unicode;
464
HASH_ADD_UNICODE(set, unicode, find);
469
/*
 * Collects, as a UnicodeSet, the code points of index entries whose key
 * starts with `s` (case-insensitive).
 *
 * Two searches over charselect->indexList locate `pos` (via
 * index_search_cmp) and `last` (via index_search_a_cmp). `last` may instead
 * be set to the array's final element, under a condition not visible in
 * this view. The loop then walks from `pos` while the first s_l characters
 * of the key match, inserting every code point of each entry. The loop
 * increment and the return are not visible here.
 */
UnicodeSet* CharSelectDataGetMatchingChars(CharSelectData* charselect, const char* s)
471
UnicodeSet *result = NULL;
472
size_t s_l = strlen(s);
473
CharSelectDataIndex **pos;
474
CharSelectDataIndex **last;
475
pos = utarray_custom_bsearch(s, charselect->indexList, 0, index_search_cmp);
476
last = utarray_custom_bsearch(s, charselect->indexList,
477
0, index_search_a_cmp);
481
last = (CharSelectDataIndex**)utarray_back(charselect->indexList);
482
while (pos != last && strncasecmp(s, (*pos)->key, s_l) == 0) {
483
utarray_foreach (c, (*pos)->items, uint16_t) {
484
result = InsertResult(result, *c);
492
/*
 * Returns the list of alias strings for `unicode`.
 *
 * An empty list is returned when CharSelectDataGetDetailIndex() yields 0.
 * Otherwise the details record supplies a 32-bit offset at byte 2 and a
 * one-byte count at byte 6. The strings are stored back to back, each
 * NUL-terminated, so the offset advances by strlen + 1 per entry.
 */
UT_array* CharSelectDataAliases(CharSelectData* charselect, uint16_t unicode)
494
const char* data = charselect->dataFile;
495
const int detailIndex = CharSelectDataGetDetailIndex(charselect, unicode);
496
if(detailIndex == 0) {
497
return fcitx_utils_new_string_list();
500
const uint8_t count = * (uint8_t *)(data + detailIndex + 6);
501
uint32_t offset = FromLittleEndian32(data + detailIndex + 2);
503
UT_array* aliases = fcitx_utils_new_string_list();
506
for (i = 0; i < count; i++) {
507
const char* r = data + offset;
508
utarray_push_back(aliases, &r);
509
offset += strlen(data + offset) + 1;
515
/*
 * Returns the list of note strings for `unicode`.
 *
 * An empty list is returned when CharSelectDataGetDetailIndex() yields 0.
 * Otherwise the details record supplies a 32-bit offset at byte 7 and a
 * one-byte count at byte 11. The strings are stored back to back, each
 * NUL-terminated, so the offset advances by strlen + 1 per entry.
 */
UT_array* CharSelectDataNotes(CharSelectData* charselect, uint16_t unicode)
517
const int detailIndex = CharSelectDataGetDetailIndex(charselect, unicode);
518
if(detailIndex == 0) {
519
return fcitx_utils_new_string_list();
522
const char* data = charselect->dataFile;
523
const uint8_t count = * (uint8_t *)(data + detailIndex + 11);
524
uint32_t offset = FromLittleEndian32(data + detailIndex + 7);
526
UT_array* notes = fcitx_utils_new_string_list();
529
for (i = 0; i < count; i++) {
530
const char* r = data + offset;
531
utarray_push_back(notes, &r);
532
offset += strlen(data + offset) + 1;
538
/*
 * Collects the "see also" code points for `unicode` into `seeAlso`, an
 * array of 16-bit elements.
 *
 * The details record supplies a 32-bit offset at byte 22 and a one-byte
 * count at byte 26; each entry is read as a 16-bit value. What happens when
 * the detail index is 0, how `offset` advances between entries, and the
 * return statement are on lines not visible in this view.
 */
UT_array* CharSelectDataSeeAlso(CharSelectData* charselect, uint16_t unicode)
541
utarray_new(seeAlso, &int16_icd);
542
const int detailIndex = CharSelectDataGetDetailIndex(charselect, unicode);
543
if(detailIndex == 0) {
547
const char* data = charselect->dataFile;
548
const uint8_t count = * (uint8_t *)(data + detailIndex + 26);
549
uint32_t offset = FromLittleEndian32(data + detailIndex + 22);
552
for (i = 0; i < count; i++) {
553
uint16_t c = FromLittleEndian16 (data + offset);
554
utarray_push_back(seeAlso, &c);
561
/*
 * Returns the list of equivalent strings for `unicode`.
 *
 * An empty list is returned when CharSelectDataGetDetailIndex() yields 0.
 * Otherwise the details record supplies a 32-bit offset at byte 17 and a
 * one-byte count at byte 21. The strings are stored back to back, each
 * NUL-terminated, so the offset advances by strlen + 1 per entry.
 */
UT_array* CharSelectDataEquivalents(CharSelectData* charselect, uint16_t unicode)
563
const int detailIndex = CharSelectDataGetDetailIndex(charselect, unicode);
564
if(detailIndex == 0) {
565
return fcitx_utils_new_string_list();
568
const char* data = charselect->dataFile;
569
const uint8_t count = * (uint8_t *)(data + detailIndex + 21);
570
uint32_t offset = FromLittleEndian32(data + detailIndex + 17);
572
UT_array* equivalents = fcitx_utils_new_string_list();
575
for (i = 0; i < count; i++) {
576
const char* r = data + offset;
577
utarray_push_back(equivalents, &r);
578
offset += strlen(data + offset) + 1;
584
/*
 * Returns the list of approximate-equivalent strings for `unicode`.
 *
 * An empty list is returned when CharSelectDataGetDetailIndex() yields 0.
 * Otherwise the details record supplies a 32-bit offset at byte 12 and a
 * one-byte count at byte 16. The strings are stored back to back, each
 * NUL-terminated, so the offset advances by strlen + 1 per entry.
 */
UT_array* CharSelectDataApproximateEquivalents(CharSelectData* charselect, uint16_t unicode)
586
const int detailIndex = CharSelectDataGetDetailIndex(charselect, unicode);
587
if(detailIndex == 0) {
588
return fcitx_utils_new_string_list();
591
const char* data = charselect->dataFile;
592
const uint8_t count = * (uint8_t *)(data + detailIndex + 16);
593
uint32_t offset = FromLittleEndian32(data + detailIndex + 12);
595
UT_array* approxEquivalents = fcitx_utils_new_string_list();
598
for (i = 0; i < count; i++) {
599
const char* r = data + offset;
600
utarray_push_back(approxEquivalents, &r);
601
offset += strlen(data + offset) + 1;
604
return approxEquivalents;
608
/*
 * Formats `code` as `prefix` followed by uppercase hex, zero-padded to
 * `length` digits. This is done in two steps: the first asprintf() builds
 * the format string "%s%0<length>X", and the second applies it to `prefix`
 * and `code`. Declarations, cleanup of `fmt` and the return statement are
 * on lines not visible in this view.
 */
char* FormatCode(uint16_t code, int length, const char* prefix)
612
asprintf(&fmt, "%%s%%0%dX", length);
613
asprintf(&s, fmt, prefix, code);
618
/*
 * Splits `s` into words and returns them in a list created by
 * fcitx_utils_new_string_list(). A word is a maximal run of characters that
 * are digits, letters or '+'; any other character acts as a separator.
 * Each word is copied with strndup() before being pushed.
 *
 * NOTE(review): isdigit()/isalpha() receive a plain char; for negative
 * char values this is undefined unless cast to unsigned char -- confirm
 * whether non-ASCII input can reach this function.
 */
UT_array* SplitString(const char* s)
620
UT_array* result = fcitx_utils_new_string_list();
623
int length = strlen(s);
624
while (end < length) {
625
// advance over the characters of one word
while (end < length && (isdigit(s[end]) || isalpha(s[end]) || s[end] == '+')) {
629
char* p = strndup(&s[start], end - start);
630
utarray_push_back(result, &p);
634
// advance over separator characters
while (end < length && !(isdigit(s[end]) || isalpha(s[end]) || s[end] == '+')) {
642
/*
 * Allocates an index entry for `key`: the key is duplicated with strdup()
 * and `items` is created as an empty array of 16-bit elements. The return
 * statement is not visible in this view.
 */
CharSelectDataIndex* CharSelectDataIndexNew(const char* key)
644
CharSelectDataIndex* idx = fcitx_utils_new(CharSelectDataIndex);
645
idx->key = strdup(key);
646
utarray_new(idx->items, &int16_icd);
650
/*
 * Registers `unicode` under every word of `str` in the charselect->index
 * hash. `str` is split with SplitString(). For each word the matching index
 * entry is looked up, and a new one is created and added to the hash when
 * needed (the guarding condition is not visible in this view). `unicode` is
 * then appended to that entry's `items`. The temporary word list is freed
 * at the end.
 */
void CharSelectDataAppendToIndex(CharSelectData* charselect, uint16_t unicode, const char* str)
652
UT_array* strings = SplitString(str);
653
utarray_foreach(s, strings, char*) {
654
CharSelectDataIndex* item = NULL;
655
HASH_FIND_STR(charselect->index, *s, item);
657
item = CharSelectDataIndexNew(*s);
658
// the hash key is the entry's own copy of the word
HASH_ADD_KEYPTR(hh, charselect->index, item->key, strlen(item->key), item);
660
utarray_push_back(item->items, &unicode);
662
utarray_free(strings);
665
/*
 * Debug helper that writes index keys to stderr, one per line. The last
 * visible loop prints the key of every entry in charselect->indexList. The
 * earlier fprintf / hh.next lines appear to belong to a walk over the hash
 * itself whose surrounding lines are not visible in this view.
 */
void CharSelectDataDump(CharSelectData* charselect)
667
//CharSelectDataIndex* item = charselect->index;
670
fprintf(stderr, "%s\n", item->key);
671
item = item->hh.next;
674
utarray_foreach(p, charselect->indexList, CharSelectDataIndex*) {
675
fprintf(stderr, "%s\n", (*p)->key);
679
/*
 * Builds the word index for the loaded data file.
 *
 * Passes visible here:
 *  1. Names table (offsets stored at bytes 4 and 8, 6-byte records): index
 *     each character name.
 *  2. Details table (offsets stored at bytes 12 and 16, 27-byte records):
 *     index aliases, notes, approximate equivalents, equivalents and the
 *     hex form of each "see also" code point.
 *  3. Unihan table (offset stored at byte 36 up to the end of the file,
 *     30-byte records with seven string offsets).
 *  4. Create charselect->indexList as a pointer array, push entries starting
 *     from charselect->index, and sort the array with pindex_cmp.
 *
 * Details record layout as read below: +0 code point (16 bit), then five
 * pairs of a 32-bit offset and a one-byte count at +2/+6 (aliases), +7/+11
 * (notes), +12/+16 (approximate equivalents), +17/+21 (equivalents) and
 * +22/+26 (see also).
 *
 * NOTE(review): inside the "see also" loop the visible update advances
 * `equivOffset`, not `seeAlsoOffset`. Unless seeAlsoOffset is advanced on a
 * line not visible here, every iteration re-reads the same entry -- confirm.
 */
void CharSelectDataCreateIndex(CharSelectData* charselect)
682
const char* data = charselect->dataFile;
683
const uint32_t nameOffsetBegin = FromLittleEndian32(data+4);
684
const uint32_t nameOffsetEnd = FromLittleEndian32(data+8);
686
int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1;
690
for (pos = 0; pos <= max; pos++) {
691
const uint16_t unicode = FromLittleEndian16(data + nameOffsetBegin + pos*6);
692
uint32_t offset = FromLittleEndian32(data + nameOffsetBegin + pos*6 + 2);
694
// the name text starts one byte past the stored offset
CharSelectDataAppendToIndex(charselect, unicode, (data + offset + 1));
698
const uint32_t detailsOffsetBegin = FromLittleEndian32(data+12);
699
const uint32_t detailsOffsetEnd = FromLittleEndian32(data+16);
701
max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1;
702
for (pos = 0; pos <= max; pos++) {
703
const uint16_t unicode = FromLittleEndian16(data + detailsOffsetBegin + pos*27);
706
// aliases
const uint8_t aliasCount = * (uint8_t *)(data + detailsOffsetBegin + pos*27 + 6);
707
uint32_t aliasOffset = FromLittleEndian32(data + detailsOffsetBegin + pos*27 + 2);
709
for (j = 0; j < aliasCount; j++) {
710
CharSelectDataAppendToIndex(charselect, unicode, data + aliasOffset);
711
aliasOffset += strlen(data + aliasOffset) + 1;
715
// notes
const uint8_t notesCount = * (uint8_t *)(data + detailsOffsetBegin + pos*27 + 11);
716
uint32_t notesOffset = FromLittleEndian32(data + detailsOffsetBegin + pos*27 + 7);
718
for (j = 0; j < notesCount; j++) {
719
CharSelectDataAppendToIndex(charselect, unicode, data + notesOffset);
720
notesOffset += strlen(data + notesOffset) + 1;
723
// approximate equivalents
724
const uint8_t apprCount = * (uint8_t *)(data + detailsOffsetBegin + pos*27 + 16);
725
uint32_t apprOffset = FromLittleEndian32(data + detailsOffsetBegin + pos*27 + 12);
727
for (j = 0; j < apprCount; j++) {
728
CharSelectDataAppendToIndex(charselect, unicode,data + apprOffset);
729
apprOffset += strlen(data + apprOffset) + 1;
733
// equivalents
const uint8_t equivCount = * (uint8_t *)(data + detailsOffsetBegin + pos*27 + 21);
734
uint32_t equivOffset = FromLittleEndian32(data + detailsOffsetBegin + pos*27 + 17);
736
for (j = 0; j < equivCount; j++) {
737
CharSelectDataAppendToIndex(charselect, unicode, data + equivOffset);
738
equivOffset += strlen(data + equivOffset) + 1;
741
// see also - convert to string (hex)
742
const uint8_t seeAlsoCount = * (uint8_t *)(data + detailsOffsetBegin + pos*27 + 26);
743
uint32_t seeAlsoOffset = FromLittleEndian32(data + detailsOffsetBegin + pos*27 + 22);
745
for (j = 0; j < seeAlsoCount; j++) {
746
uint16_t seeAlso = FromLittleEndian16 (data + seeAlsoOffset);
747
char* code = FormatCode(seeAlso, 4, "");
748
CharSelectDataAppendToIndex(charselect, unicode, code);
750
// NOTE(review): advances equivOffset, not seeAlsoOffset -- see header note
equivOffset += strlen(data + equivOffset) + 1;
755
// temporarily disabled due to the huge amount of data
756
const uint32_t unihanOffsetBegin = FromLittleEndian32(data+36);
757
const uint32_t unihanOffsetEnd = charselect->size;
758
max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1;
760
for (pos = 0; pos <= max; pos++) {
761
const uint16_t unicode = FromLittleEndian16(data + unihanOffsetBegin + pos*30);
762
for(j = 0; j < 7; j++) {
763
uint32_t offset = FromLittleEndian32(data + unihanOffsetBegin + pos*30 + 2 + j*4);
765
CharSelectDataAppendToIndex(charselect, unicode, (data + offset));
770
// collect pointers to the hash entries into an array, then sort by key
utarray_new(charselect->indexList, fcitx_ptr_icd);
772
CharSelectDataIndex* idx = charselect->index;
774
utarray_push_back(charselect->indexList, &idx);
778
utarray_sort(charselect->indexList, pindex_cmp);
781
/*
 * Releases resources owned by `charselect`: the sorted pointer array, then
 * each hash entry (removed from charselect->index one at a time, with its
 * `items` array freed), and finally the loaded data file buffer. Freeing of
 * each entry's key and struct, and of `charselect` itself, is not visible
 * in this view.
 */
void CharSelectDataFree(CharSelectData* charselect)
783
utarray_free(charselect->indexList);
784
// repeatedly remove the head entry until the hash is empty
while(charselect->index) {
785
CharSelectDataIndex* p = charselect->index;
786
HASH_DEL(charselect->index, p);
788
utarray_free(p->items);
791
free(charselect->dataFile);