1
/* nfkc.c Unicode normalization utilities.
2
* Copyright (C) 2002, 2003 Simon Josefsson
4
* This file is part of GNU Libidn.
6
* GNU Libidn is free software; you can redistribute it and/or
7
* modify it under the terms of the GNU Lesser General Public
8
* License as published by the Free Software Foundation; either
9
* version 2.1 of the License, or (at your option) any later version.
11
* GNU Libidn is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
* Lesser General Public License for more details.
16
* You should have received a copy of the GNU Lesser General Public
17
* License along with GNU Libidn; if not, write to the Free Software
18
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24
/* This file contains functions from GLIB, including gutf8.c and
25
* gunidecomp.c, all licensed under LGPL and copyright held by:
27
* Copyright (C) 1999, 2000 Tom Tromey
28
* Copyright 2000 Red Hat, Inc.
31
/* Hacks to make syncing with GLIB code easier. */
34
/* Type names used by the imported GLib code, mapped onto plain C types
 * and onto the project's my_*_t fixed-width types. */
#define guchar unsigned char
37
#define guint unsigned int
38
#define gushort unsigned short
39
#define gint16 my_int16_t
40
#define guint16 my_uint16_t
41
/* gunichar is the type the decoding routines below use for one decoded
 * character (see g_utf8_get_char). */
#define gunichar my_uint32_t
43
#define gssize ssize_t
44
/* Allocation goes straight to the C library's malloc. */
#define g_malloc malloc
47
/* Error reporting is compiled out: every g_set_error call collapses to
 * the constant 0 and its four arguments are discarded. */
#define g_set_error(a,b,c,d) 0
48
/* g_new:
 * @struct_type: element type to allocate.
 * @n_structs: number of elements wanted.
 *
 * Allocates uninitialised storage for @n_structs objects of
 * @struct_type through g_malloc and returns it as a @struct_type
 * pointer.
 *
 * The byte count is sizeof (struct_type) * n_structs.  If that product
 * cannot be represented in a gsize the macro yields NULL instead of
 * letting the multiplication wrap around and hand back a buffer that is
 * far smaller than the caller believes.  A NULL result was already
 * possible (g_malloc may fail), so callers see no new failure mode.
 *
 * @n_structs is evaluated more than once; pass an expression without
 * side effects.
 */
#define g_new(struct_type, n_structs)					\
  (((gsize) (n_structs)) > (~(gsize) 0) / sizeof (struct_type)		\
   ? (struct_type *) NULL						\
   : (struct_type *) g_malloc (((gsize) sizeof (struct_type))		\
			       * ((gsize) (n_structs))))
50
/* G_STMT_START / G_STMT_END bracket a braced block so that a macro body
 * can be used as a single statement.  The spelling depends on the
 * compiler; not every directive of this conditional is among the lines
 * shown here. */
# if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
51
/* GNU C outside strict ANSI mode and outside C++: wrap the block in a
 * parenthesised expression cast to void. */
# define G_STMT_START (void)(
54
# if (defined (sun) || defined (__sun__))
55
/* Sun compilers: "if (1) { ... } else (void)0". */
# define G_STMT_START if (1)
56
# define G_STMT_END else (void)0
58
/* Remaining spelling: "do { ... } while (0)". */
# define G_STMT_START do
59
# define G_STMT_END while (0)
62
/* Precondition checks are compiled out: the expansion is an empty
 * statement that never evaluates expr or val and never returns, so code
 * using this macro gets no argument validation from it. */
#define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
63
/* G_N_ELEMENTS: element count of the array arr, computed at compile time
 * as the size of the whole array divided by the size of its first
 * element.  Only meaningful for true arrays, not for pointers. */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))
67
/* Code from GLIB gunicode.h starts here. */
72
/* NFD is another name for the DEFAULT mode. */
G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
73
G_NORMALIZE_DEFAULT_COMPOSE,
74
/* NFC is another name for DEFAULT_COMPOSE; _g_utf8_normalize_wc runs its
 * composition pass for NFC and NFKC. */
G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
76
/* NFKD is another name for ALL; _g_utf8_normalize_wc requests
 * compatibility decompositions for NFKD and NFKC. */
G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
77
G_NORMALIZE_ALL_COMPOSE,
78
/* NFKC is another name for ALL_COMPOSE. */
G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
82
/* Code from GLIB gutf8.c starts here. */
84
/* UTF8_COMPUTE classifies the lead byte Char.  The branches shown here
 * match the bit patterns 110xxxxx, 1110xxxx, 11110xxx, 111110xx and
 * 1111110x; presumably each one sets Mask and Len for that sequence
 * length (the assignments are not among the lines shown -- confirm).
 * Char is used unparenthesised in the tests, so pass a plain variable,
 * as g_utf8_get_char does. */
#define UTF8_COMPUTE(Char, Mask, Len) \
90
else if ((Char & 0xe0) == 0xc0) \
95
else if ((Char & 0xf0) == 0xe0) \
100
else if ((Char & 0xf8) == 0xf0) \
105
else if ((Char & 0xfc) == 0xf8) \
110
else if ((Char & 0xfe) == 0xfc) \
118
/* UTF8_LENGTH:
 * Number of bytes (1 to 6) this file uses to encode the value Char in
 * UTF-8.  The thresholds are the first values that no longer fit in
 * 7, 11, 16, 21 and 26 bits respectively.  Char may be evaluated
 * several times, so it must not have side effects.
 */
#define UTF8_LENGTH(Char)		\
  ((Char) < 0x80 ? 1			\
   : (Char) < 0x800 ? 2			\
   : (Char) < 0x10000 ? 3		\
   : (Char) < 0x200000 ? 4		\
   : (Char) < 0x4000000 ? 5		\
   : 6)
126
/* UTF8_GET assembles a character from the bytes at Chars.  Result starts
 * as the lead byte masked with Mask; bytes 1 .. Len-1 are then visited
 * with Count as the loop index, each one is tested for the 10xxxxxx
 * continuation pattern, and its low six bits are OR-ed into Result.
 * The expansion is several statements, not one expression. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
127
(Result) = (Chars)[0] & (Mask); \
128
for ((Count) = 1; (Count) < (Len); ++(Count)) \
130
if (((Chars)[(Count)] & 0xc0) != 0x80) \
136
(Result) |= ((Chars)[(Count)] & 0x3f); \
139
/* UNICODE_VALID:
 * Non-zero when Char passes all four range tests:
 *   - it is below 0x110000,
 *   - it is outside 0xD800..0xDFFF (the surrogate block),
 *   - it is outside 0xFDD0..0xFDEF,
 *   - its low sixteen bits are neither 0xFFFE nor 0xFFFF.
 * Char is evaluated several times; pass a side-effect free expression.
 */
#define UNICODE_VALID(Char)				\
  (   ((Char) < 0x110000)				\
   && (((Char) & 0xFFFFF800) != 0xD800)			\
   && (((Char) < 0xFDD0) || ((Char) > 0xFDEF))		\
   && (((Char) & 0xFFFE) != 0xFFFE))
146
/* Table with one entry per possible byte value.  g_utf8_next_char uses
 * the entry for the byte at the current position as the number of bytes
 * to advance.  In the rows shown the values run 1, then 2, then 3, 4
 * and 5. */
static const gchar utf8_skip_data[256] = {
147
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
149
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
151
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
153
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
155
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
157
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
159
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
161
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
165
/* Read-only pointer to the table; this is the name g_utf8_next_char
 * indexes. */
const gchar *const g_utf8_skip = utf8_skip_data;
167
/* g_utf8_next_char:
 * Advances p by the number of bytes g_utf8_skip records for the byte at
 * p and yields the new position as a char pointer.  The byte is read
 * through a const-qualified pointer so that passing a const string does
 * not discard the qualifier in the cast.  p is evaluated twice; pass an
 * expression without side effects.
 */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(const guchar *)(p)])
171
* @p: pointer to the start of a UTF-8 encoded string.
172
* @max: the maximum number of bytes to examine. If @max
173
* is less than 0, then the string is assumed to be
174
* nul-terminated. If @max is 0, @p will not be examined and
177
* Returns the length of the string in characters.
179
* Return value: the length of the string in characters
182
/* Counts the characters of the UTF-8 string p.  start keeps the original
 * position so the number of bytes consumed so far is p - start. */
g_utf8_strlen (const gchar * p, gssize max)
185
const gchar *start = p;
186
/* Expands to an empty statement under this file's definition of
 * g_return_val_if_fail, so a NULL p is not actually rejected here. */
g_return_val_if_fail (p != NULL || max == 0, 0);
192
p = g_utf8_next_char (p);
201
p = g_utf8_next_char (p);
203
/* Bounded walk: continue while fewer than max bytes have been consumed
 * and the current byte is not NUL. */
while (p - start < max && *p)
206
p = g_utf8_next_char (p);
209
/* NOTE(review): the final test below is true only when the walk stopped
 * exactly at max.  When the loop stops earlier at a NUL byte,
 * p - start is smaller than max and the test is false; if it guards the
 * last count increment, as the comment that follows says, that last
 * character goes uncounted.  Check whether "<=" was intended. */
/* only do the last len increment if we got a complete
210
* char (don't count partial chars)
212
if (p - start == max)
221
* @p: a pointer to Unicode character encoded as UTF-8
223
* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
224
* If @p does not point to a valid UTF-8 encoded character, results are
225
* undefined. If you are not sure that the bytes are complete
226
* valid Unicode characters, you should use g_utf8_get_char_validated()
229
* Return value: the resulting character
232
/* Decodes the UTF-8 sequence that starts at p: the lead byte is
 * classified with UTF8_COMPUTE, then UTF8_GET combines the lead byte and
 * the following bytes into result. */
g_utf8_get_char (const gchar * p)
234
int i, mask = 0, len;
236
/* Read the lead byte as unsigned so the bit tests in UTF8_COMPUTE do not
 * depend on whether plain char is signed. */
unsigned char c = (unsigned char) *p;
238
UTF8_COMPUTE (c, mask, len);
240
/* Failure value: -1 converted to gunichar. */
return (gunichar) - 1;
241
UTF8_GET (result, p, i, mask, len);
248
* @c: an ISO 10646 character code
249
* @outbuf: output buffer, must have at least 6 bytes of space.
250
* If %NULL, the length will be computed and returned
251
* and nothing will be written to @outbuf.
253
* Converts a single character to UTF-8.
255
* Return value: number of bytes written
258
/* Writes the UTF-8 encoding of c into outbuf.  The range tests shown use
 * the same thresholds as UTF8_LENGTH (0x10000, 0x200000, 0x4000000). */
g_unichar_to_utf8 (gunichar c, gchar * outbuf)
274
else if (c < 0x10000)
279
else if (c < 0x200000)
284
else if (c < 0x4000000)
297
/* Bytes after the first are filled from the last position backwards;
 * each is six bits of c under the 10xxxxxx marker. */
for (i = len - 1; i > 0; --i)
299
outbuf[i] = (c & 0x3f) | 0x80;
302
/* The first byte is c OR-ed with the value held in first. */
outbuf[0] = c | first;
309
* g_utf8_to_ucs4_fast:
310
* @str: a UTF-8 encoded string
311
* @len: the maximum length of @str to use. If @len < 0, then
312
* the string is nul-terminated.
313
* @items_written: location to store the number of characters in the
316
* Convert a string from UTF-8 to a 32-bit fixed width
317
* representation as UCS-4, assuming valid UTF-8 input.
318
* This function is roughly twice as fast as g_utf8_to_ucs4()
319
* but does no error checking on the input.
321
* Return value: a pointer to a newly allocated UCS-4 string.
322
* This value must be freed with g_free().
325
/* Converts UTF-8 to an array of gunichar in two passes: the string is
 * first walked with g_utf8_next_char, then a buffer of n_chars + 1
 * elements is allocated and each character is decoded into it. */
g_utf8_to_ucs4_fast (const gchar * str, ssize_t len, size_t * items_written)
332
/* Expands to an empty statement under this file's definition of
 * g_return_val_if_fail; a NULL str is not rejected by this line. */
g_return_val_if_fail (str != NULL, NULL);
340
p = g_utf8_next_char (p);
346
/* Length-limited walk: stop at str + len or at a NUL byte. */
while (p < str + len && *p)
348
p = g_utf8_next_char (p);
353
/* One element more than the number of characters (presumably for a
 * terminating 0 -- the store is not among the lines shown). */
result = g_new (gunichar, n_chars + 1);
356
for (i = 0; i < n_chars; i++)
358
/* Start from the lead byte, read as unsigned. */
gunichar wc = ((unsigned char *) p)[0];
393
/* Fold in six bits from each following byte; no check is made here that
 * the byte really is a continuation byte. */
for (j = 1; j < charlen; j++)
396
wc |= ((unsigned char *) p)[j] & 0x3f;
413
* @str: a UCS-4 encoded string
414
* @len: the maximum length of @str to use. If @len < 0, then
415
* the string is terminated with a 0 character.
416
* @items_read: location to store number of characters read, or %NULL.
417
* @items_written: location to store number of bytes written or %NULL.
418
* The value here stored does not include the trailing 0
420
* @error: location to store the error occurring, or %NULL to ignore
421
* errors. Any of the errors in #GConvertError other than
422
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
424
* Convert a string from a 32-bit fixed width representation as UCS-4
425
* to UTF-8. The result will be terminated with a 0 byte.
427
* Return value: a pointer to a newly allocated UTF-8 string.
428
* This value must be freed with g_free(). If an
429
* error occurs, %NULL will be returned and
433
/* Converts an array of gunichar to UTF-8 in two steps: the first loop
 * adds up UTF8_LENGTH of every input value, then result_length + 1
 * bytes are allocated and each value is encoded with
 * g_unichar_to_utf8. */
g_ucs4_to_utf8 (const gunichar * str,
435
size_t * items_read, size_t * items_written, GError ** error)
438
gchar *result = NULL;
443
/* A negative len removes the length limit from this loop condition; the
 * loop then has to be ended from inside its body. */
for (i = 0; len < 0 || i < len; i++)
448
/* Values with the top bit set are rejected; the disabled error text
 * below calls them out of range for UTF-8. */
if (str[i] >= 0x80000000)
453
/*g_set_error (error, G_CONVERT_ERROR,
454
G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
455
_("Character out of range for UTF-8"));*/
459
result_length += UTF8_LENGTH (str[i]);
462
/* One byte more than the encoded length. */
result = g_malloc (result_length + 1);
466
/* Encode until the output cursor has covered result_length bytes;
 * g_unichar_to_utf8's return value advances the cursor. */
while (p < result + result_length)
467
p += g_unichar_to_utf8 (str[i++], p);
472
/* Bytes written is the distance the output cursor has moved. */
*items_written = p - result;
481
/* Code from GLIB gunidecomp.c starts here. */
483
#include "gunidecomp.h"
484
#include "gunicomp.h"
486
/* CC_PART1: value for the character at offset Char within page Page,
 * using the first page table.  A page entry below
 * G_UNICODE_MAX_TABLE_INDEX is a row number in cclass_data and the
 * value is read from that row; any other entry encodes one value for
 * the whole page as entry - G_UNICODE_MAX_TABLE_INDEX. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part1[Page]][Char]) \
   : (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX))
491
/* CC_PART2: same lookup as the first-part macro but through the second
 * page table.  A page entry below G_UNICODE_MAX_TABLE_INDEX is a row
 * number in cclass_data; any other entry encodes one value for the
 * whole page as entry - G_UNICODE_MAX_TABLE_INDEX. */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part2[Page]][Char]) \
   : (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX))
496
/* COMBINING_CLASS selects the lookup by range: values up to
 * G_UNICODE_LAST_CHAR_PART1 go through CC_PART1, values from 0xe0000 up
 * to G_UNICODE_LAST_CHAR go through CC_PART2 after subtracting 0xe0000.
 * In both cases the page is the value shifted right by eight bits and
 * the offset is its low byte.  The branch for values in neither range
 * is not among the lines shown here.  Char is evaluated several times. */
#define COMBINING_CLASS(Char) \
497
(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
498
? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
499
: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
500
? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
503
/* constants for hangul syllable [de]composition */
511
/* NCount: VCount times TCount, the divisor decompose_hangul uses to
 * recover the leading index from a syllable index. */
#define NCount ((VCount) * (TCount))
512
/* SCount: LCount times NCount, the exclusive upper bound used when
 * testing whether a syllable index is in range. */
#define SCount ((LCount) * (NCount))
515
* g_unicode_canonical_ordering:
516
* @string: a UCS-4 encoded string.
517
* @len: the maximum length of @string to use.
519
* Computes the canonical ordering of a string in-place.
520
* This rearranges decomposed characters in the string
521
* according to their combining classes. See the Unicode
522
* manual for more information.
525
/* Reorders string in place so that a character whose combining class is
 * non-zero and smaller than that of the character before it is moved
 * left past characters with a larger class. */
g_unicode_canonical_ordering (gunichar * string, gsize len)
534
last = COMBINING_CLASS (string[0]);
535
/* NOTE(review): string[0] is read above and the bound here is len - 1;
 * if gsize is an unsigned type and len is 0 the bound wraps to a huge
 * value.  No guard is visible in the lines shown -- confirm callers
 * never pass an empty range. */
for (i = 0; i < len - 1; ++i)
537
int next = COMBINING_CLASS (string[i + 1]);
538
/* Out of order: the next character has a non-zero class that is smaller
 * than the class of the one before it. */
if (next != 0 && last > next)
541
/* Percolate item leftward through string. */
542
for (j = i + 1; j > 0; --j)
545
/* Stop shifting at the first character whose class is not greater than
 * the class of the one being moved. */
if (COMBINING_CLASS (string[j - 1]) <= next)
548
string[j] = string[j - 1];
552
/* We're re-entering the loop looking at the old
561
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
562
* r should be null or have sufficient space. Calling with r == NULL will
563
* only calculate the result_len; however, a buffer with space for three
564
* characters will always be big enough. */
566
/* Splits the Hangul syllable s arithmetically.  SIndex is the offset of
 * s from SBase.  NOTE(review): the subtraction is stored in a gint and
 * the "< 0" test below relies on values of s below SBase coming out
 * negative after that conversion -- confirm for the target platforms. */
decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
568
gint SIndex = s - SBase;
570
/* not a hangul syllable */
571
if (SIndex < 0 || SIndex >= SCount)
579
/* The three parts come from SIndex by division and remainder:
 * SIndex / NCount selects L, the remainder modulo NCount divided by
 * TCount selects V, and SIndex modulo TCount selects T. */
gunichar L = LBase + SIndex / NCount;
580
gunichar V = VBase + (SIndex % NCount) / TCount;
581
gunichar T = TBase + SIndex % TCount;
600
/* returns a pointer to a null-terminated UTF-8 string */
602
/* Looks ch up in decomp_table by repeatedly halving the interval
 * [start, end) (this presumes the table is ordered by its ch field --
 * confirm against the generated header) and returns a pointer into
 * decomp_expansion_string at the offset stored in the matching entry. */
find_decomposition (gunichar ch, gboolean compat)
605
int end = G_N_ELEMENTS (decomp_table);
607
/* Search only when ch lies between the first and the last key. */
if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
611
int half = (start + end) / 2;
612
if (ch == decomp_table[half].ch)
618
/* Take compat_offset first and fall back to canon_offset when the entry
 * has no compatibility offset (presumably the branch taken when compat
 * is set; the selecting test is not among the lines shown). */
offset = decomp_table[half].compat_offset;
619
if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
620
offset = decomp_table[half].canon_offset;
624
/* Other branch: only canon_offset is considered. */
offset = decomp_table[half].canon_offset;
625
if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
629
return &(decomp_expansion_string[offset]);
631
/* half == start means the interval cannot shrink any further. */
else if (half == start)
633
else if (ch > decomp_table[half].ch)
643
/* L,V => LV and LV,T => LVT */
645
/* Tries to compose the pair (a, b) arithmetically and stores the
 * composed value in *result.  a is measured both against LBase (as a
 * leading part) and against SBase (as an existing syllable); b is
 * measured both against VBase and against TBase. */
combine_hangul (gunichar a, gunichar b, gunichar * result)
647
gint LIndex = a - LBase;
648
gint SIndex = a - SBase;
650
gint VIndex = b - VBase;
651
gint TIndex = b - TBase;
653
/* Case 1: a is a valid L index and b a valid V index. */
if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
655
*result = SBase + (LIndex * VCount + VIndex) * TCount;
658
/* Case 2: a is a syllable whose index is a multiple of TCount and b is
 * taken as a trailing part.
 * NOTE(review): every other range test in this function uses a strict
 * "<" against its count, but this one accepts TIndex == TCount.  With
 * TIndex == TCount, a + TIndex is the syllable TCount positions after
 * a, i.e. the next multiple-of-TCount syllable rather than a plus a
 * trailing part.  TIndex == 0 is accepted as well and yields a
 * unchanged.  Check whether "0 < TIndex && TIndex < TCount" was
 * intended. */
else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
659
&& 0 <= TIndex && TIndex <= TCount)
661
*result = a + TIndex;
668
/* CI: compose index for the character at offset Char within page Page.
 * A page entry below G_UNICODE_MAX_TABLE_INDEX is a row number in
 * compose_data and the index is read from that row; any other entry
 * encodes one index for the whole page as
 * entry - G_UNICODE_MAX_TABLE_INDEX. */
#define CI(Page, Char) \
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_data[compose_table[Page]][Char]) \
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))
673
/* COMPOSE_INDEX: compose index of Char.  Characters whose page number
 * (Char >> 8) is at most COMPOSE_TABLE_LAST are looked up with CI using
 * the page number and the low byte; every other character gets index 0.
 * Char is evaluated several times. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) <= (COMPOSE_TABLE_LAST)) ? CI ((Char) >> 8, (Char) & 0xff) : 0)
677
/* Tries to compose the pair (a, b).  The arithmetic Hangul rule is tried
 * first; otherwise the compose indices of a and b select one of three
 * tables: compose_first_single, compose_second_single or the
 * two-dimensional compose_array. */
combine (gunichar a, gunichar b, gunichar * result)
679
gushort index_a, index_b;
681
if (combine_hangul (a, b, result))
684
index_a = COMPOSE_INDEX (a);
686
/* index_a in [COMPOSE_FIRST_SINGLE_START, COMPOSE_SECOND_START): the row
 * for a holds the one partner it is compared against in column 0 and a
 * second value in column 1 (where that value is stored is not among the
 * lines shown). */
if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
688
if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
691
compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
698
index_b = COMPOSE_INDEX (b);
700
/* index_b at or above COMPOSE_SECOND_SINGLE_START: the same two-column
 * layout, keyed by b, in compose_second_single. */
if (index_b >= COMPOSE_SECOND_SINGLE_START)
703
compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
706
compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
713
/* Both indices in their general ranges: look the pair up in
 * compose_array, with each index rebased to start at 0. */
if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
714
&& index_b >= COMPOSE_SECOND_START
715
&& index_b < COMPOSE_SECOND_SINGLE_START)
718
compose_array[index_a - COMPOSE_FIRST_START][index_b -
719
COMPOSE_SECOND_START];
732
/* Normalizes the UTF-8 string str into a newly allocated gunichar
 * buffer.  The input is walked twice: a first pass sizes the buffer, a
 * second pass fills it and applies canonical ordering; an optional last
 * pass composes characters in place.
 * NOTE(review): g_utf8_get_char can return (gunichar) -1; no check for
 * that value is visible in the lines shown -- confirm how malformed
 * input is handled. */
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
738
/* NFKC and NFKD ask for compatibility decompositions; NFC and NFKC ask
 * for the composition pass at the end. */
gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
739
gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
743
/* Pass 1: count output characters.  A negative max_len means no byte
 * limit; the walk always stops at a NUL byte. */
while ((max_len < 0 || p < str + max_len) && *p)
746
gunichar wc = g_utf8_get_char (p);
748
/* Range 0xac00..0xd7af is handed to decompose_hangul; with a NULL
 * buffer only result_len is requested. */
if (wc >= 0xac00 && wc <= 0xd7af)
751
decompose_hangul (wc, NULL, &result_len);
756
decomp = find_decomposition (wc, do_compat);
759
/* The decomposition is itself UTF-8; count its characters. */
n_wc += g_utf8_strlen (decomp, -1);
764
p = g_utf8_next_char (p);
767
/* One element more than the counted total. */
wc_buffer = g_new (gunichar, n_wc + 1);
772
/* Pass 2: same walk, now storing the decomposed characters. */
while ((max_len < 0 || p < str + max_len) && *p)
774
gunichar wc = g_utf8_get_char (p);
777
/* Remember where this input character's output begins. */
gsize old_n_wc = n_wc;
779
if (wc >= 0xac00 && wc <= 0xd7af)
782
decompose_hangul (wc, wc_buffer + n_wc, &result_len);
787
decomp = find_decomposition (wc, do_compat);
792
for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
793
wc_buffer[n_wc++] = g_utf8_get_char (pd);
796
wc_buffer[n_wc++] = wc;
801
/* The combining class of the first character just written is examined;
 * canonical ordering is applied to the stretch starting at last_start,
 * and last_start then moves to this character's output. */
cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
805
g_unicode_canonical_ordering (wc_buffer + last_start,
807
last_start = old_n_wc;
811
p = g_utf8_next_char (p);
816
/* Order the stretch that was still open when the input ended. */
g_unicode_canonical_ordering (wc_buffer + last_start,
823
/* All decomposed and reordered */
825
/* Composition pass: wc_buffer[last_start] is combined with wc_buffer[i]
 * when the class test allows it; on success the result replaces
 * wc_buffer[last_start] and the tail is shifted left by one. */
if (do_compose && n_wc > 0)
831
for (i = 0; i < n_wc; i++)
833
int cc = COMBINING_CLASS (wc_buffer[i]);
836
(last_cc == 0 || last_cc != cc) &&
837
combine (wc_buffer[last_start], wc_buffer[i],
838
&wc_buffer[last_start]))
840
for (j = i + 1; j < n_wc; j++)
841
wc_buffer[j - 1] = wc_buffer[j];
848
last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
867
* @str: a UTF-8 encoded string.
868
* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
869
* @mode: the type of normalization to perform.
871
* Converts a string into canonical form, standardizing
872
* such issues as whether a character with an accent
873
* is represented as a base character and combining
874
* accent or as a single precomposed character. You
875
* should generally call g_utf8_normalize() before
876
* comparing two Unicode strings.
878
* The normalization mode %G_NORMALIZE_DEFAULT only
879
* standardizes differences that do not affect the
880
* text content, such as the above-mentioned accent
881
* representation. %G_NORMALIZE_ALL also standardizes
882
* the "compatibility" characters in Unicode, such
883
* as SUPERSCRIPT THREE to the standard forms
884
* (in this case DIGIT THREE). Formatting information
885
* may be lost but for most text operations such
886
* characters should be considered the same.
887
* For example, g_utf8_collate() normalizes
888
* with %G_NORMALIZE_ALL as its first step.
890
* %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
891
* are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
892
* but return a result with composed forms rather
893
* than a maximally decomposed form. This is often
894
* useful if you intend to convert the string to
895
* a legacy encoding or pass it to a system with
896
* less capable Unicode handling.
898
* Return value: a newly allocated string, that is the
899
* normalized form of @str.
902
/* Normalizes str by running _g_utf8_normalize_wc and converting the
 * resulting gunichar array back to UTF-8. */
g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
904
gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
907
/* Length -1: the array is converted without a length limit; the three
 * optional outputs are not requested. */
result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
913
/* Public Libidn API starts here. */
916
* stringprep_utf8_to_unichar:
917
* @p: a pointer to Unicode character encoded as UTF-8
919
* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
920
* If @p does not point to a valid UTF-8 encoded character, results are
921
* undefined. If you are not sure that the bytes are complete
922
* valid Unicode characters, you should use g_utf8_get_char_validated()
925
* Return value: the resulting character
928
/* Public wrapper: passes p unchanged to g_utf8_get_char and returns its
 * result. */
stringprep_utf8_to_unichar (const char *p)
930
return g_utf8_get_char (p);
934
* stringprep_unichar_to_utf8:
935
* @c: an ISO 10646 character code
936
* @outbuf: output buffer, must have at least 6 bytes of space.
937
* If %NULL, the length will be computed and returned
938
* and nothing will be written to @outbuf.
940
* Converts a single character to UTF-8.
942
* Return value: number of bytes written
945
/* Public wrapper: passes c and outbuf unchanged to g_unichar_to_utf8 and
 * returns its result. */
stringprep_unichar_to_utf8 (my_uint32_t c, char *outbuf)
947
return g_unichar_to_utf8 (c, outbuf);
951
* stringprep_utf8_to_ucs4:
952
* @str: a UTF-8 encoded string
953
* @len: the maximum length of @str to use. If @len < 0, then
954
* the string is nul-terminated.
955
* @items_written: location to store the number of characters in the
958
* Convert a string from UTF-8 to a 32-bit fixed width
959
* representation as UCS-4, assuming valid UTF-8 input.
960
* This function does no error checking on the input.
962
* Return value: a pointer to a newly allocated UCS-4 string.
963
* This value must be freed with free().
966
/* Public wrapper: passes all three arguments unchanged to
 * g_utf8_to_ucs4_fast and returns its result. */
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
968
return g_utf8_to_ucs4_fast (str, len, items_written);
972
* stringprep_ucs4_to_utf8:
973
* @str: a UCS-4 encoded string
974
* @len: the maximum length of @str to use. If @len < 0, then
975
* the string is terminated with a 0 character.
976
* @items_read: location to store number of characters read, or %NULL.
977
* @items_written: location to store number of bytes written or %NULL.
978
* The value here stored does not include the trailing 0
981
* Convert a string from a 32-bit fixed width representation as UCS-4
982
* to UTF-8. The result will be terminated with a 0 byte.
984
* Return value: a pointer to a newly allocated UTF-8 string.
985
* This value must be freed with free(). If an
986
* error occurs, %NULL will be returned and
990
/* Public wrapper around g_ucs4_to_utf8: the four arguments are passed
 * through and NULL is supplied for the error argument. */
stringprep_ucs4_to_utf8 (const my_uint32_t * str, ssize_t len,
991
size_t * items_read, size_t * items_written)
993
return g_ucs4_to_utf8 (str, len, items_read,
994
items_written, NULL);
998
* stringprep_utf8_nfkc_normalize:
999
* @str: a UTF-8 encoded string.
1000
* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1002
* Converts a string into canonical form, standardizing
1003
* such issues as whether a character with an accent
1004
* is represented as a base character and combining
1005
* accent or as a single precomposed character.
1007
* The normalization mode is NFKC (ALL COMPOSE). It standardizes
1008
* differences that do not affect the text content, such as the
1009
* above-mentioned accent representation. It standardizes the
1010
* "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1011
* the standard forms (in this case DIGIT THREE). Formatting
1012
* information may be lost but for most text operations such
1013
* characters should be considered the same. It returns a result with
1014
* composed forms rather than a maximally decomposed form.
1016
* Return value: a newly allocated string, that is the
1017
* NFKC normalized form of @str.
1020
/* Public wrapper: calls g_utf8_normalize on str and len with the mode
 * fixed to G_NORMALIZE_NFKC and returns its result. */
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1022
return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1026
* stringprep_ucs4_nfkc_normalize:
1027
* @str: a Unicode string.
1028
* @len: length of @str array, or -1 if @str is nul-terminated.
1030
* Converts UCS4 string into UTF-8 and runs
1031
* stringprep_utf8_nfkc_normalize().
1033
* Return value: a newly allocated Unicode string, that is the NFKC
1034
* normalized form of @str.
1037
stringprep_ucs4_nfkc_normalize (my_uint32_t * str, ssize_t len)
1040
my_uint32_t *result_wc;
1042
p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1043
result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);