1
/* ------------------------------------------------------------------------
3
unicodedata -- Provides access to the Unicode database.
5
Data was extracted from the UnicodeData.txt file.
6
The current version number is reported in the unidata_version constant.
8
Written by Marc-Andre Lemburg (mal@lemburg.com).
9
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10
Modified by Martin v. Löwis (martin@v.loewis.de)
12
Copyright (c) Corporation for National Research Initiatives.
14
------------------------------------------------------------------------ */
18
#include "structmember.h"
24
/*[clinic checksum: da39a3ee5e6b4b0d3255bfef95601890afd80709]*/
26
/* character properties */
29
/* NOTE(review): fragment of the database-record struct; the
   "typedef struct {" opener is not visible in this chunk.  Each byte
   field is either a raw property value or an index into one of the
   generated name tables. */
const unsigned char category; /* index into
30
_PyUnicode_CategoryNames */
31
const unsigned char combining; /* combining class value 0 - 255 */
32
const unsigned char bidirectional; /* index into
33
_PyUnicode_BidirectionalNames */
34
const unsigned char mirrored; /* true if mirrored in bidir mode */
35
const unsigned char east_asian_width; /* index into
36
_PyUnicode_EastAsianWidth */
37
const unsigned char normalization_quick_check; /* see is_normalized() */
38
} _PyUnicode_DatabaseRecord;
40
/* Per-code-point deltas against the current database, used by the
   "previous version" (ucd_3_2_0) API.  For the byte fields, 0xFF means
   "unchanged" (see the != 0xFF tests in the accessors below). */
typedef struct change_record {
41
/* sequence of fields should be the same as in merge_old_version */
42
const unsigned char bidir_changed;
43
const unsigned char category_changed;
44
const unsigned char decimal_changed;
45
const unsigned char mirrored_changed;
46
const double numeric_changed;
49
/* data file generated by Tools/unicode/makeunicodedata.py */
50
#include "unicodedata_db.h"
52
static const _PyUnicode_DatabaseRecord*
53
_getrecord_ex(Py_UCS4 code)
59
index = index1[(code>>SHIFT)];
60
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
63
return &_PyUnicode_Database_Records[index];
66
/* ------------- Previous-version API ------------------------------------- */
67
/* NOTE(review): fragment of the PreviousDBVersion object layout; the
   PyObject_HEAD and the `name` member (referenced via offsetof below)
   are not visible in this chunk. */
typedef struct previous_version {
70
const change_record* (*getrecord)(Py_UCS4);
71
Py_UCS4 (*normalization)(Py_UCS4);
74
/* Fetch the old-version delta record for code point v via the function
   pointer stored on the UCD instance. */
#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
76
static PyMemberDef DB_members[] = {
77
{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
81
/* forward declaration */
82
static PyTypeObject UCD_Type;
83
/* True when o is a "previous version" UCD instance (module-level calls
   pass the module object / NULL instead). */
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
86
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
87
Py_UCS4 (*normalization)(Py_UCS4))
89
PreviousDBVersion *self;
90
self = PyObject_New(PreviousDBVersion, &UCD_Type);
94
self->getrecord = getrecord;
95
self->normalization = normalization;
96
return (PyObject*)self;
100
/* Extract the single code point from a 1-character str object.
   Returns (Py_UCS4)-1 on error: either the object could not be made
   "ready", or (with TypeError set) it is not exactly one character. */
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    if (PyUnicode_READY(obj))
        return (Py_UCS4)-1;
    /* The original fragment re-checked PyUnicode_READY inside the length
       test; that second check is redundant once the first succeeded. */
    if (PyUnicode_GET_LENGTH(obj) == 1)
        return PyUnicode_READ_CHAR(obj, 0);
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
114
/* --- Module API --------------------------------------------------------- */
118
/* NOTE(review): the lines below are remnants of an Argument Clinic
   input block; its comment delimiters are not visible in this chunk. */
unicodedata.UCD.decimal
120
unichr: object(type='str')
124
Converts a Unicode character into its equivalent decimal value.
126
Returns the decimal value assigned to the Unicode character unichr
127
as integer. If no such value is defined, default is returned, or, if
128
not given, ValueError is raised.
131
/* Clinic-generated docstring and method-table entry for decimal(). */
PyDoc_STRVAR(unicodedata_UCD_decimal__doc__,
132
"decimal(unichr, default=None)\n"
133
"Converts a Unicode character into its equivalent decimal value.\n"
135
"Returns the decimal value assigned to the Unicode character unichr\n"
136
"as integer. If no such value is defined, default is returned, or, if\n"
137
"not given, ValueError is raised.");
139
#define UNICODEDATA_UCD_DECIMAL_METHODDEF \
140
{"decimal", (PyCFunction)unicodedata_UCD_decimal, METH_VARARGS, unicodedata_UCD_decimal__doc__},
143
unicodedata_UCD_decimal_impl(PyObject *self, PyObject *unichr, PyObject *default_value);
146
/* Clinic-generated argument-parsing wrapper for decimal(). */
unicodedata_UCD_decimal(PyObject *self, PyObject *args)
148
PyObject *return_value = NULL;
150
PyObject *default_value = NULL;
152
if (!PyArg_ParseTuple(args,
154
&PyUnicode_Type, &unichr, &default_value))
156
return_value = unicodedata_UCD_decimal_impl(self, unichr, default_value);
163
/* decimal() implementation: current-database lookup, with an override
   path when self is an old-version UCD instance. */
unicodedata_UCD_decimal_impl(PyObject *self, PyObject *unichr, PyObject *default_value)
164
/*[clinic checksum: 9576fa55f4ea0be82968af39dc9d0283e634beeb]*/
166
PyUnicodeObject *v = (PyUnicodeObject *)unichr;
172
/* getuchar failed (wrong type / length); error already set */
if (c == (Py_UCS4)-1)
175
/* emulating an older database: apply the recorded delta first */
if (self && UCD_Check(self)) {
176
const change_record *old = get_old_record(self, c);
177
if (old->category_changed == 0) {
182
else if (old->decimal_changed != 0xFF) {
184
rc = old->decimal_changed;
189
rc = Py_UNICODE_TODECIMAL(c);
191
/* no decimal value: return caller-supplied default, else raise */
if (default_value == NULL) {
192
PyErr_SetString(PyExc_ValueError,
197
Py_INCREF(default_value);
198
return default_value;
201
return PyLong_FromLong(rc);
204
PyDoc_STRVAR(unicodedata_digit__doc__,
205
"digit(unichr[, default])\n\
207
Returns the digit value assigned to the Unicode character unichr as\n\
208
integer. If no such value is defined, default is returned, or, if\n\
209
not given, ValueError is raised.");
212
/* digit(unichr[, default]) — METH_VARARGS wrapper; digit values have no
   old-version override path. */
unicodedata_digit(PyObject *self, PyObject *args)
215
PyObject *defobj = NULL;
219
if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
222
if (c == (Py_UCS4)-1)
224
rc = Py_UNICODE_TODIGIT(c);
226
/* no digit value: return caller-supplied default, else raise */
if (defobj == NULL) {
227
PyErr_SetString(PyExc_ValueError, "not a digit");
235
return PyLong_FromLong(rc);
238
PyDoc_STRVAR(unicodedata_numeric__doc__,
239
"numeric(unichr[, default])\n\
241
Returns the numeric value assigned to the Unicode character unichr\n\
242
as float. If no such value is defined, default is returned, or, if\n\
243
not given, ValueError is raised.");
246
/* numeric(unichr[, default]) — METH_VARARGS wrapper returning a float. */
unicodedata_numeric(PyObject *self, PyObject *args)
249
PyObject *defobj = NULL;
254
if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
257
if (c == (Py_UCS4)-1)
260
/* emulating an older database: apply the recorded delta first */
if (self && UCD_Check(self)) {
261
const change_record *old = get_old_record(self, c);
262
if (old->category_changed == 0) {
267
/* NOTE(review): this override reads decimal_changed, yet change_record
   also carries a numeric_changed (double) field — verify that numeric()
   is really meant to use decimal_changed here. */
else if (old->decimal_changed != 0xFF) {
269
rc = old->decimal_changed;
274
rc = Py_UNICODE_TONUMERIC(c);
276
if (defobj == NULL) {
277
PyErr_SetString(PyExc_ValueError, "not a numeric character")
285
return PyFloat_FromDouble(rc);
288
PyDoc_STRVAR(unicodedata_category__doc__,
291
Returns the general category assigned to the Unicode character\n\
295
/* category(unichr) — general-category name, e.g. "Lu"; honours
   old-version overrides when self is a UCD instance. */
unicodedata_category(PyObject *self, PyObject *args)
301
if (!PyArg_ParseTuple(args, "O!:category",
302
&PyUnicode_Type, &v))
305
if (c == (Py_UCS4)-1)
307
index = (int) _getrecord_ex(c)->category;
308
if (self && UCD_Check(self)) {
309
const change_record *old = get_old_record(self, c);
310
/* 0xFF means "category unchanged" in the delta record */
if (old->category_changed != 0xFF)
311
index = old->category_changed;
313
return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
316
PyDoc_STRVAR(unicodedata_bidirectional__doc__,
317
"bidirectional(unichr)\n\
319
Returns the bidirectional class assigned to the Unicode character\n\
320
unichr as string. If no such value is defined, an empty string is\n\
324
/* bidirectional(unichr) — bidi class name, "" when unassigned. */
unicodedata_bidirectional(PyObject *self, PyObject *args)
330
if (!PyArg_ParseTuple(args, "O!:bidirectional",
331
&PyUnicode_Type, &v))
334
if (c == (Py_UCS4)-1)
336
index = (int) _getrecord_ex(c)->bidirectional;
337
if (self && UCD_Check(self)) {
338
const change_record *old = get_old_record(self, c);
339
if (old->category_changed == 0)
340
index = 0; /* unassigned */
341
else if (old->bidir_changed != 0xFF)
342
index = old->bidir_changed;
344
return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
347
PyDoc_STRVAR(unicodedata_combining__doc__,
348
"combining(unichr)\n\
350
Returns the canonical combining class assigned to the Unicode\n\
351
character unichr as integer. Returns 0 if no combining class is\n\
355
/* combining(unichr) — canonical combining class as an int, 0 default. */
unicodedata_combining(PyObject *self, PyObject *args)
361
if (!PyArg_ParseTuple(args, "O!:combining",
362
&PyUnicode_Type, &v))
365
if (c == (Py_UCS4)-1)
367
index = (int) _getrecord_ex(c)->combining;
368
if (self && UCD_Check(self)) {
369
const change_record *old = get_old_record(self, c);
370
if (old->category_changed == 0)
371
index = 0; /* unassigned */
373
return PyLong_FromLong(index);
376
PyDoc_STRVAR(unicodedata_mirrored__doc__,
379
Returns the mirrored property assigned to the Unicode character\n\
380
unichr as integer. Returns 1 if the character has been identified as\n\
381
a \"mirrored\" character in bidirectional text, 0 otherwise.");
384
/* mirrored(unichr) — 1 when mirrored in bidirectional text, else 0. */
unicodedata_mirrored(PyObject *self, PyObject *args)
390
if (!PyArg_ParseTuple(args, "O!:mirrored",
391
&PyUnicode_Type, &v))
394
if (c == (Py_UCS4)-1)
396
index = (int) _getrecord_ex(c)->mirrored;
397
if (self && UCD_Check(self)) {
398
const change_record *old = get_old_record(self, c);
399
if (old->category_changed == 0)
400
index = 0; /* unassigned */
401
else if (old->mirrored_changed != 0xFF)
402
index = old->mirrored_changed;
404
return PyLong_FromLong(index);
407
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
408
"east_asian_width(unichr)\n\
410
Returns the east asian width assigned to the Unicode character\n\
414
/* east_asian_width(unichr) — width-class name, e.g. "W", "Na". */
unicodedata_east_asian_width(PyObject *self, PyObject *args)
420
if (!PyArg_ParseTuple(args, "O!:east_asian_width",
421
&PyUnicode_Type, &v))
424
if (c == (Py_UCS4)-1)
426
index = (int) _getrecord_ex(c)->east_asian_width;
427
if (self && UCD_Check(self)) {
428
const change_record *old = get_old_record(self, c);
429
if (old->category_changed == 0)
430
index = 0; /* unassigned */
432
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
435
PyDoc_STRVAR(unicodedata_decomposition__doc__,
436
"decomposition(unichr)\n\
438
Returns the character decomposition mapping assigned to the Unicode\n\
439
character unichr as string. An empty string is returned in case no\n\
440
such mapping is defined.");
443
/* decomposition(unichr) — formatted decomposition mapping, e.g.
   "<compat> 0020 0308"; "" when no mapping (or unassigned in an old
   database version). */
unicodedata_decomposition(PyObject *self, PyObject *args)
447
int code, index, count;
449
unsigned int prefix_index;
452
if (!PyArg_ParseTuple(args, "O!:decomposition",
453
&PyUnicode_Type, &v))
456
if (c == (Py_UCS4)-1)
461
if (self && UCD_Check(self)) {
462
const change_record *old = get_old_record(self, c);
463
if (old->category_changed == 0)
464
return PyUnicode_FromString(""); /* unassigned */
467
if (code < 0 || code >= 0x110000)
470
/* two-level trie lookup into the decomposition data */
index = decomp_index1[(code>>DECOMP_SHIFT)];
471
index = decomp_index2[(index<<DECOMP_SHIFT)+
472
(code&((1<<DECOMP_SHIFT)-1))];
475
/* high byte is number of hex bytes (usually one or two), low byte
476
is prefix code (an index into decomp_prefix). */
477
count = decomp_data[index] >> 8;
479
/* XXX: could allocate the PyString up front instead
480
(strlen(prefix) + 5 * count + 1 bytes) */
482
/* Based on how index is calculated above and decomp_data is generated
483
from Tools/unicode/makeunicodedata.py, it should not be possible
484
to overflow decomp_prefix. */
485
prefix_index = decomp_data[index] & 255;
486
assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
489
i = strlen(decomp_prefix[prefix_index]);
490
memcpy(decomp, decomp_prefix[prefix_index], i);
492
/* append each decomposition code point as " %04X" */
while (count-- > 0) {
495
assert(i < sizeof(decomp));
496
PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
497
decomp_data[++index]);
498
i += strlen(decomp + i);
500
return PyUnicode_FromStringAndSize(decomp, i);
504
/* Shared trie lookup for the normalizers: fills *index/*prefix/*count
   for `code`; an out-of-range or (old-version) unassigned code point
   yields an empty decomposition. */
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
506
if (code >= 0x110000) {
508
} else if (self && UCD_Check(self) &&
509
get_old_record(self, code)->category_changed==0) {
510
/* unassigned in old version */
514
*index = decomp_index1[(code>>DECOMP_SHIFT)];
515
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
516
(code&((1<<DECOMP_SHIFT)-1))];
519
/* high byte is number of hex bytes (usually one or two), low byte
520
is prefix code (an index into decomp_prefix). */
521
*count = decomp_data[*index] >> 8;
522
*prefix = decomp_data[*index] & 255;
534
/* Derived Hangul syllable counts (Unicode ch. 3.12 algorithmic
   decomposition): NCount = V*T combinations per leading jamo, SCount =
   total precomposed syllables. */
#define NCount (VCount*TCount)
535
#define SCount (LCount*NCount)
538
/* Decompose `input` to NFD (k == 0) or NFKD (k != 0): push characters
   on a work stack, expand Hangul syllables and database decompositions,
   then canonically reorder combining marks by class. */
nfd_nfkd(PyObject *self, PyObject *input, int k)
542
Py_ssize_t i, o, osize;
545
/* Longest decomposition in Unicode 3.2: U+FDFA */
547
Py_ssize_t space, isize;
548
int index, prefix, count, stackptr;
549
unsigned char prev, cur;
552
isize = PyUnicode_GET_LENGTH(input);
553
/* Overallocate at most 10 characters. */
554
space = (isize > 10 ? 10 : isize) + isize;
556
output = PyMem_Malloc(space * sizeof(Py_UCS4));
562
kind = PyUnicode_KIND(input);
563
data = PyUnicode_DATA(input);
566
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
568
Py_UCS4 code = stack[--stackptr];
569
/* Hangul Decomposition adds three characters in
570
a single step, so we need at least that much room. */
575
new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
576
if (new_output == NULL) {
583
/* Hangul Decomposition. */
584
if (SBase <= code && code < (SBase+SCount)) {
585
int SIndex = code - SBase;
586
int L = LBase + SIndex / NCount;
587
int V = VBase + (SIndex % NCount) / TCount;
588
int T = TBase + SIndex % TCount;
598
/* normalization changes */
599
if (self && UCD_Check(self)) {
600
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
602
stack[stackptr++] = value;
607
/* Other decompositions. */
608
get_decomp_record(self, code, &index, &prefix, &count);
610
/* Copy character if it is not decomposable, or has a
611
compatibility decomposition, but we do NFD. */
612
if (!count || (prefix && !k)) {
617
/* Copy decomposition onto the stack, in reverse order. */
620
code = decomp_data[index + (--count)];
621
stack[stackptr++] = code;
626
result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
631
/* result is guaranteed to be ready, as it is compact. */
632
kind = PyUnicode_KIND(result);
633
data = PyUnicode_DATA(result);
635
/* Sort canonically: bubble adjacent combining marks into
   non-decreasing combining-class order (starters act as barriers). */
637
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
638
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
639
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
640
if (prev == 0 || cur == 0 || prev <= cur) {
644
/* Non-canonical order. Need to switch *i with previous. */
647
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
648
PyUnicode_WRITE(kind, data, o+1,
649
PyUnicode_READ(kind, data, o));
650
PyUnicode_WRITE(kind, data, o, tmp);
654
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
655
if (prev == 0 || prev <= cur)
658
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
664
/* Map a code point to its row in the nfc_first/nfc_last composition
   tables, or -1 if absent.  The `nfc` table is a list of sorted,
   non-overlapping ranges terminated by a row whose .start is 0, so we
   can stop as soon as `code` falls before the current range. */
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
{
    unsigned int index;
    for (index = 0; nfc[index].start; index++) {
        unsigned int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            unsigned int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}
680
/* Compose `input` to NFC (k == 0) or NFKC (k != 0): first decompose via
   nfd_nfkd, then greedily combine starters with following unblocked
   characters (Hangul algorithmically, everything else via the
   comp_index/comp_data pairing table). */
nfc_nfkc(PyObject *self, PyObject *input, int k)
686
Py_ssize_t i, i1, o, len;
687
int f,l,index,index1,comb;
689
Py_ssize_t skipped[20];
692
result = nfd_nfkd(self, input, k);
695
/* result will be "ready". */
696
kind = PyUnicode_KIND(result);
697
data = PyUnicode_DATA(result);
698
len = PyUnicode_GET_LENGTH(result);
700
/* We allocate a buffer for the output.
701
If we find that we made no changes, we still return the original
string. */
703
output = PyMem_Malloc(len * sizeof(Py_UCS4));
713
/* drop positions already consumed as the second half of a pair */
for (index = 0; index < cskipped; index++) {
714
if (skipped[index] == i) {
715
/* *i character is skipped. */
717
skipped[index] = skipped[cskipped-1];
720
goto again; /* continue while */
723
/* Hangul Composition. We don't need to check for <LV,T>
724
pairs, since we always have decomposed data. */
725
code = PyUnicode_READ(kind, data, i);
726
if (LBase <= code && code < (LBase+LCount) &&
728
VBase <= PyUnicode_READ(kind, data, i+1) &&
729
/* NOTE(review): '<= (VBase+VCount)' accepts one code point past the
   vowel-jamo range — looks like an off-by-one vs. '<'; confirm against
   the UAX #15 Hangul composition algorithm. */
PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
731
LIndex = code - LBase;
732
VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
733
code = SBase + (LIndex*VCount+VIndex)*TCount;
736
TBase <= PyUnicode_READ(kind, data, i) &&
737
/* NOTE(review): same '<=' boundary question as the V range above. */
PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
738
code += PyUnicode_READ(kind, data, i)-TBase;
745
/* code is still input[i] here */
746
f = find_nfc_index(self, nfc_first, code);
752
/* Find next unblocked character. */
755
/* output base character for now; might be updated later. */
756
output[o] = PyUnicode_READ(kind, data, i);
758
Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
759
int comb1 = _getrecord_ex(code1)->combining;
764
/* Character is blocked. */
769
l = find_nfc_index(self, nfc_last, code1);
770
/* i1 cannot be combined with i. If i1
771
is a starter, we don't need to look further.
772
Otherwise, record the combining class. */
781
/* pairing-table lookup: does (first, last) compose? */
index = f*TOTAL_LAST + l;
782
index1 = comp_index[index >> COMP_SHIFT];
783
code = comp_data[(index1<<COMP_SHIFT)+
784
(index&((1<<COMP_SHIFT)-1))];
788
/* Replace the original character. */
790
/* Mark the second character unused. */
791
assert(cskipped < 20);
792
skipped[cskipped++] = i1;
794
f = find_nfc_index(self, nfc_first, output[o]);
798
/* Output character was already written.
799
Just advance the indices. */
803
/* No changes. Return original string. */
808
result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
814
/* Return 1 if the input is certainly normalized, 0 if it might not be. */
816
/* Quick-check pass: scan once, consulting the per-character quickcheck
   bits and the combining-class ordering; any hit means "might not be
   normalized" and the caller must run the full normalizer. */
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
821
unsigned char prev_combining = 0, quickcheck_mask;
823
/* An older version of the database is requested, quickchecks must be
disabled. */
825
if (self && UCD_Check(self))
828
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
829
as described in http://unicode.org/reports/tr15/#Annex8. */
830
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
833
kind = PyUnicode_KIND(input);
834
data = PyUnicode_DATA(input);
835
len = PyUnicode_GET_LENGTH(input);
837
Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
838
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
839
unsigned char combining = record->combining;
840
unsigned char quickcheck = record->normalization_quick_check;
842
if (quickcheck & quickcheck_mask)
843
return 0; /* this string might need normalization */
844
if (combining && prev_combining > combining)
845
return 0; /* non-canonical sort order, not normalized */
846
prev_combining = combining;
848
return 1; /* certainly normalized */
851
PyDoc_STRVAR(unicodedata_normalize__doc__,
852
"normalize(form, unistr)\n\
854
Return the normal form 'form' for the Unicode string unistr. Valid\n\
855
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
858
/* normalize(form, unistr) — dispatch on the form name; each branch
   first runs the quick check and skips the full normalizer when the
   input is already normalized. */
unicodedata_normalize(PyObject *self, PyObject *args)
863
if(!PyArg_ParseTuple(args, "sO!:normalize",
864
&form, &PyUnicode_Type, &input))
867
if (PyUnicode_READY(input) == -1)
870
if (PyUnicode_GET_LENGTH(input) == 0) {
871
/* Special case empty input strings, since resizing
872
them later would cause internal errors. */
877
if (strcmp(form, "NFC") == 0) {
878
if (is_normalized(self, input, 1, 0)) {
882
return nfc_nfkc(self, input, 0);
884
if (strcmp(form, "NFKC") == 0) {
885
if (is_normalized(self, input, 1, 1)) {
889
return nfc_nfkc(self, input, 1);
891
if (strcmp(form, "NFD") == 0) {
892
if (is_normalized(self, input, 0, 0)) {
896
return nfd_nfkd(self, input, 0);
898
if (strcmp(form, "NFKD") == 0) {
899
if (is_normalized(self, input, 0, 1)) {
903
return nfd_nfkd(self, input, 1);
905
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
909
/* -------------------------------------------------------------------- */
910
/* unicode character name tables */
912
/* data file generated by Tools/unicode/makeunicodedata.py */
913
#include "unicodename_db.h"
915
/* -------------------------------------------------------------------- */
916
/* database code (cut and pasted from the unidb package) */
919
/* Case-insensitive multiplicative hash over `len` bytes of `s`, folded
   back to 24 bits on overflow.  Must stay in sync with the hash used by
   Tools/unicode/makeunicodedata.py when building the name hash table. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
933
/* Jamo name fragments used to build/parse "HANGUL SYLLABLE ..." names:
   columns are [leading consonant, vowel, trailing consonant].
   NOTE(review): most rows of this table are not visible in this chunk. */
static char *hangul_syllables[][3] = {
937
{ "D", "YAE", "GS" },
938
{ "DD", "EO", "N", },
940
{ "M", "YEO", "NH" },
944
{ "SS", "WAE", "LM" },
948
{ "C", "WEO", "LP" },
964
/* These ranges need to match makeunicodedata.py:cjk_ranges. */
966
is_unified_ideograph(Py_UCS4 code)
969
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
970
(0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
971
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
972
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
973
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
976
/* macros used to determine if the given codepoint is in the PUA range that
977
* we are using to store aliases and named sequences */
978
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
979
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
980
(cp < named_sequences_end))
983
/* Write the character's name into `buffer` (NUL-terminated); returns
   0 on failure (no name, or buffer too small). */
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
984
int with_alias_and_seq)
986
/* Find the name associated with the given codepoint.
987
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
988
* that we are using for aliases and named sequences. */
994
if (code >= 0x110000)
997
/* XXX should we just skip all the codepoints in the PUAs here? */
998
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1001
if (self && UCD_Check(self)) {
1002
/* in 3.2.0 there are no aliases and named sequences */
1003
const change_record *old;
1004
if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1006
old = get_old_record(self, code);
1007
if (old->category_changed == 0) {
1013
if (SBase <= code && code < SBase+SCount) {
1014
/* Hangul syllable. */
1015
int SIndex = code - SBase;
1016
int L = SIndex / NCount;
1017
int V = (SIndex % NCount) / TCount;
1018
int T = SIndex % TCount;
1021
/* Worst case: HANGUL SYLLABLE <10chars>. */
1023
strcpy(buffer, "HANGUL SYLLABLE ");
1025
strcpy(buffer, hangul_syllables[L][0]);
1026
buffer += strlen(hangul_syllables[L][0]);
1027
strcpy(buffer, hangul_syllables[V][1]);
1028
buffer += strlen(hangul_syllables[V][1]);
1029
strcpy(buffer, hangul_syllables[T][2]);
1030
buffer += strlen(hangul_syllables[T][2]);
1035
if (is_unified_ideograph(code)) {
1037
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1039
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1043
/* get offset into phrasebook */
1044
offset = phrasebook_offset1[(code>>phrasebook_shift)];
1045
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1046
(code&((1<<phrasebook_shift)-1))];
1053
/* get word index */
1054
word = phrasebook[offset] - phrasebook_short;
1056
word = (word << 8) + phrasebook[offset+1];
1059
word = phrasebook[offset++];
1062
return 0; /* buffer overflow */
1065
/* copy word string from lexicon. the last character in the
1066
word has bit 7 set. the last word in a string ends with \0. */
1068
w = lexicon + lexicon_offset[word];
1071
return 0; /* buffer overflow */
1075
return 0; /* buffer overflow */
1076
buffer[i++] = *w & 127;
1078
break; /* end of word */
1085
/* True when `code`'s name equals `name` (case-insensitive on the query
   side; stored names are already upper-case). */
_cmpname(PyObject *self, int code, const char* name, int namelen)
1087
/* check if code corresponds to the given name */
1089
char buffer[NAME_MAXLEN];
1090
if (!_getucname(self, code, buffer, sizeof(buffer), 1))
1092
for (i = 0; i < namelen; i++) {
1093
if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1096
return buffer[namelen] == '\0';
1100
/* Longest-prefix match of `str` against column `column` of
   hangul_syllables; used when parsing "HANGUL SYLLABLE ..." names. */
find_syllable(const char *str, int *len, int *pos, int count, int column)
1104
for (i = 0; i < count; i++) {
1105
char *s = hangul_syllables[i][column];
1106
len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1109
if (strncmp(str, s, len1) == 0) {
1120
/* Resolve a name-table hit `cp` into the final code point stored in
   *code.  Returns 0 when `cp` is a named-sequence PUA slot and named
   sequences are not allowed; otherwise stores the (possibly
   alias-resolved) value and returns 1. */
static int
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
{
    /* check if named sequences are allowed */
    if (!with_named_seq && IS_NAMED_SEQ(cp))
        return 0;
    /* if the codepoint is in the PUA range that we use for aliases,
     * convert it to obtain the right codepoint */
    if (IS_ALIAS(cp))
        *code = name_aliases[cp-aliases_start];
    else
        *code = cp;
    return 1;
}
1135
/* Name -> code point lookup: handles algorithmic Hangul and CJK names
   first, then probes the generated open-addressed hash table. */
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1138
/* Return the codepoint associated with the given name.
1139
* Named aliases are resolved too (unless self != NULL (i.e. we are using
1140
* 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
1141
* using for the named sequence, and the caller must then convert it. */
1143
unsigned int mask = code_size-1;
1144
unsigned int i, incr;
1146
/* Check for hangul syllables. */
1147
if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1148
int len, L = -1, V = -1, T = -1;
1149
const char *pos = name + 16;
1150
find_syllable(pos, &len, &L, LCount, 0);
1152
find_syllable(pos, &len, &V, VCount, 1);
1154
find_syllable(pos, &len, &T, TCount, 2);
1156
/* a valid name must consume the whole string and match all three
   jamo columns; then compose the syllable arithmetically */
if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1157
*code = SBase + (L*VCount+V)*TCount + T;
1160
/* Otherwise, it's an illegal syllable name. */
1164
/* Check for unified ideographs. */
1165
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1166
/* Four or five hexdigits must follow. */
1170
if (namelen != 4 && namelen != 5)
1174
if (*name >= '0' && *name <= '9')
1176
else if (*name >= 'A' && *name <= 'F')
1177
v += *name - 'A' + 10;
1182
if (!is_unified_ideograph(v))
1188
/* the following is the same as python's dictionary lookup, with
1189
only minor changes. see the makeunicodedata script for more
details. */
1192
h = (unsigned int) _gethash(name, namelen, code_magic);
1197
if (_cmpname(self, v, name, namelen))
1198
return _check_alias_and_seq(v, code, with_named_seq);
1199
incr = (h ^ (h >> 3)) & mask;
1203
/* open-addressing probe sequence */
i = (i + incr) & mask;
1207
if (_cmpname(self, v, name, namelen))
1208
return _check_alias_and_seq(v, code, with_named_seq);
1211
incr = incr ^ code_poly;
1215
/* C API table exported to other modules through the ucnhash_CAPI
   capsule (used e.g. by the \N{...} escape in string literals). */
static const _PyUnicode_Name_CAPI hashAPI =
1217
sizeof(_PyUnicode_Name_CAPI),
1222
/* -------------------------------------------------------------------- */
1223
/* Python bindings */
1225
PyDoc_STRVAR(unicodedata_name__doc__,
1226
"name(unichr[, default])\n\
1227
Returns the name assigned to the Unicode character unichr as a\n\
1228
string. If no name is defined, default is returned, or, if not\n\
1229
given, ValueError is raised.");
1232
/* name(unichr[, default]) — official character name; aliases and named
   sequences are excluded (with_alias_and_seq == 0). */
unicodedata_name(PyObject* self, PyObject* args)
1234
char name[NAME_MAXLEN];
1238
PyObject* defobj = NULL;
1239
if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1243
if (c == (Py_UCS4)-1)
1246
if (!_getucname(self, c, name, sizeof(name), 0)) {
1247
/* no name: return caller-supplied default, else raise */
if (defobj == NULL) {
1248
PyErr_SetString(PyExc_ValueError, "no such name");
1257
return PyUnicode_FromString(name);
1260
PyDoc_STRVAR(unicodedata_lookup__doc__,
1263
Look up character by name. If a character with the\n\
1264
given name is found, return the corresponding Unicode\n\
1265
character. If not found, KeyError is raised.");
1268
/* lookup(name) — inverse of name(); also resolves aliases and named
   sequences (the latter come back as multi-character strings). */
unicodedata_lookup(PyObject* self, PyObject* args)
1275
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1278
if (!_getcode(self, name, namelen, &code, 1)) {
1279
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1282
/* check if code is in the PUA range that we use for named sequences
1284
if (IS_NAMED_SEQ(code)) {
1285
index = code-named_sequences_start;
1286
return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1287
named_sequences[index].seq,
1288
named_sequences[index].seqlen);
1290
return PyUnicode_FromOrdinal(code);
1293
/* XXX Add doc strings. */
1295
/* XXX Add doc strings. */
1295
/* Method table shared by the module and the UCD ("previous version")
   type, so every function takes self and checks UCD_Check(self). */
static PyMethodDef unicodedata_functions[] = {
1296
UNICODEDATA_UCD_DECIMAL_METHODDEF
1297
{"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1298
{"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1299
{"category", unicodedata_category, METH_VARARGS,
1300
unicodedata_category__doc__},
1301
{"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1302
unicodedata_bidirectional__doc__},
1303
{"combining", unicodedata_combining, METH_VARARGS,
1304
unicodedata_combining__doc__},
1305
{"mirrored", unicodedata_mirrored, METH_VARARGS,
1306
unicodedata_mirrored__doc__},
1307
{"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1308
unicodedata_east_asian_width__doc__},
1309
{"decomposition", unicodedata_decomposition, METH_VARARGS,
1310
unicodedata_decomposition__doc__},
1311
{"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1312
{"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1313
{"normalize", unicodedata_normalize, METH_VARARGS,
1314
unicodedata_normalize__doc__},
1315
{NULL, NULL} /* sentinel */
1318
/* Type object for "previous version" database instances (ucd_3_2_0).
   NOTE(review): many slot lines of this initializer are not visible in
   this chunk. */
static PyTypeObject UCD_Type = {
1319
/* The ob_type field must be initialized in the module init function
1320
* to be portable to Windows without using C++. */
1321
PyVarObject_HEAD_INIT(NULL, 0)
1322
"unicodedata.UCD", /*tp_name*/
1323
sizeof(PreviousDBVersion), /*tp_basicsize*/
1326
(destructor)PyObject_Del, /*tp_dealloc*/
1333
0, /*tp_as_sequence*/
1334
0, /*tp_as_mapping*/
1338
PyObject_GenericGetAttr,/*tp_getattro*/
1341
Py_TPFLAGS_DEFAULT, /*tp_flags*/
1345
0, /*tp_richcompare*/
1346
0, /*tp_weaklistoffset*/
1349
unicodedata_functions, /*tp_methods*/
1350
DB_members, /*tp_members*/
1356
0, /*tp_dictoffset*/
1364
PyDoc_STRVAR(unicodedata_docstring,
1365
"This module provides access to the Unicode Character Database which\n\
1366
defines character properties for all Unicode characters. The data in\n\
1367
this database is based on the UnicodeData.txt file version\n\
1368
" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
1370
The module uses the same names and symbols as defined by the\n\
1371
UnicodeData File Format " UNIDATA_VERSION ".");
1373
/* Module definition.  NOTE(review): the m_name and m_size slots are
   not visible in this chunk. */
static struct PyModuleDef unicodedatamodule = {
1374
PyModuleDef_HEAD_INIT,
1376
unicodedata_docstring,
1378
unicodedata_functions,
1386
/* Module initialization: finish UCD_Type, create the module, and expose
   unidata_version, the UCD type, the 3.2.0 snapshot, and the
   ucnhash_CAPI capsule. */
PyInit_unicodedata(void)
1390
/* deferred ob_type init; see the comment on UCD_Type above */
Py_TYPE(&UCD_Type) = &PyType_Type;
1392
m = PyModule_Create(&unicodedatamodule);
1396
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1397
Py_INCREF(&UCD_Type);
1398
PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1400
/* Previous versions */
1401
v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1403
PyModule_AddObject(m, "ucd_3_2_0", v);
1406
/* export the name-lookup C API for other modules (\N{...} escapes) */
v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1408
PyModule_AddObject(m, "ucnhash_CAPI", v);
1415
indent-tabs-mode: nil