1
/* gutf8.c - Operations on UTF-8 strings.
3
* Copyright (C) 1999 Tom Tromey
4
* Copyright (C) 2000 Red Hat, Inc.
6
* This library is free software; you can redistribute it and/or
7
* modify it under the terms of the GNU Lesser General Public
8
* License as published by the Free Software Foundation; either
9
* version 2 of the License, or (at your option) any later version.
11
* This library is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
* Lesser General Public License for more details.
16
* You should have received a copy of the GNU Lesser General Public
17
* License along with this library; if not, write to the
18
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19
* Boston, MA 02111-1307, USA.
32
#ifdef G_PLATFORM_WIN32
39
#include "libcharset/libcharset.h"
44
#define UTF8_COMPUTE(Char, Mask, Len) \
50
else if ((Char & 0xe0) == 0xc0) \
55
else if ((Char & 0xf0) == 0xe0) \
60
else if ((Char & 0xf8) == 0xf0) \
65
else if ((Char & 0xfc) == 0xf8) \
70
else if ((Char & 0xfe) == 0xfc) \
78
/*
 * UTF8_LENGTH:
 * @Char: a character code (integer expression). It is evaluated more
 *        than once, so it must not have side effects.
 *
 * Expands to the number of bytes (1 to 6) needed to encode @Char:
 *
 *   @Char <  0x80        -> 1
 *   @Char <  0x800       -> 2
 *   @Char <  0x10000     -> 3
 *   @Char <  0x200000    -> 4
 *   @Char <  0x4000000   -> 5
 *   anything larger      -> 6
 *
 * The thresholds are tested in ascending order, so the first matching
 * bound decides the result.
 */
#define UTF8_LENGTH(Char)              \
  ((Char) < 0x80 ? 1 :                 \
   ((Char) < 0x800 ? 2 :               \
    ((Char) < 0x10000 ? 3 :            \
     ((Char) < 0x200000 ? 4 :          \
      ((Char) < 0x4000000 ? 5 : 6)))))
86
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
87
(Result) = (Chars)[0] & (Mask); \
88
for ((Count) = 1; (Count) < (Len); ++(Count)) \
90
if (((Chars)[(Count)] & 0xc0) != 0x80) \
96
(Result) |= ((Chars)[(Count)] & 0x3f); \
99
/*
 * UNICODE_VALID:
 * @Char: a character code (unsigned integer expression). It is evaluated
 *        more than once, so it must not have side effects.
 *
 * Expands to a non-zero value when @Char passes all four checks below,
 * and to 0 otherwise:
 *
 *   - it is below 0x110000;
 *   - it is not in 0xD800..0xDFFF (the block this file treats as UTF-16
 *     high/low surrogates), tested by masking off the low 11 bits;
 *   - it is not in 0xFDD0..0xFDEF;
 *   - its low 16 bits are neither 0xFFFE nor 0xFFFF (masking with 0xFFFE
 *     makes both compare equal to 0xFFFE).
 */
#define UNICODE_VALID(Char)                   \
    ((Char) < 0x110000 &&                     \
     (((Char) & 0xFFFFF800) != 0xD800) &&     \
     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
     ((Char) & 0xFFFE) != 0xFFFE)
106
static const gchar utf8_skip_data[256] = {
107
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
108
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
109
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
110
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
111
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
112
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
113
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
114
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
117
/* Non-static, read-only alias for the file-local 256-entry utf8_skip_data
 * table; both the pointer and the bytes it points to are const. */
const gchar * const g_utf8_skip = utf8_skip_data;
120
* g_utf8_find_prev_char:
121
* @str: pointer to the beginning of a UTF-8 encoded string
122
* @p: pointer to some position within @str
124
* Given a position @p within a UTF-8 encoded string @str, find the start
125
* of the previous UTF-8 character starting before @p. Returns %NULL if no
126
* UTF-8 characters are present in @str before @p.
128
* @p does not have to be at the beginning of a UTF-8 character. No check
129
* is made to see if the character found is actually valid other than
130
* it starts with an appropriate byte.
132
* Return value: a pointer to the found character or %NULL.
135
g_utf8_find_prev_char (const char *str,
138
for (--p; p >= str; --p)
140
if ((*p & 0xc0) != 0x80)
147
* g_utf8_find_next_char:
148
* @p: a pointer to a position within a UTF-8 encoded string
149
* @end: a pointer to the end of the string, or %NULL to indicate
150
* that the string is nul-terminated, in which case
151
* the returned value will be
153
* Finds the start of the next UTF-8 character in the string after @p.
155
* @p does not have to be at the beginning of a UTF-8 character. No check
156
* is made to see if the character found is actually valid other than
157
* it starts with an appropriate byte.
159
* Return value: a pointer to the found character or %NULL
162
g_utf8_find_next_char (const gchar *p,
168
for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
171
for (++p; (*p & 0xc0) == 0x80; ++p)
174
return (p == end) ? NULL : (gchar *)p;
179
* @p: a pointer to a position within a UTF-8 encoded string
181
* Finds the previous UTF-8 character in the string before @p.
183
* @p does not have to be at the beginning of a UTF-8 character. No check
184
* is made to see if the character found is actually valid other than
185
* it starts with an appropriate byte. If @p might be the first
186
* character of the string, you must use g_utf8_find_prev_char() instead.
188
* Return value: a pointer to the found character.
191
g_utf8_prev_char (const gchar *p)
196
if ((*p & 0xc0) != 0x80)
203
* @p: pointer to the start of a UTF-8 encoded string.
204
* @max: the maximum number of bytes to examine. If @max
205
* is less than 0, then the string is assumed to be
206
* nul-terminated. If @max is 0, @p will not be examined and
209
* Returns the length of the string in characters.
211
* Return value: the length of the string in characters
214
g_utf8_strlen (const gchar *p,
218
const gchar *start = p;
219
g_return_val_if_fail (p != NULL || max == 0, 0);
225
p = g_utf8_next_char (p);
234
p = g_utf8_next_char (p);
236
while (p - start < max && *p)
239
p = g_utf8_next_char (p);
242
/* only do the last len increment if we got a complete
243
* char (don't count partial chars)
245
if (p - start <= max)
254
* @p: a pointer to Unicode character encoded as UTF-8
256
* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
257
* If @p does not point to a valid UTF-8 encoded character, results are
258
* undefined. If you are not sure that the bytes are complete
259
* valid Unicode characters, you should use g_utf8_get_char_validated()
262
* Return value: the resulting character
265
g_utf8_get_char (const gchar *p)
267
int i, mask = 0, len;
269
unsigned char c = (unsigned char) *p;
271
UTF8_COMPUTE (c, mask, len);
274
UTF8_GET (result, p, i, mask, len);
280
* g_utf8_offset_to_pointer:
281
* @str: a UTF-8 encoded string
282
* @offset: a character offset within @str
284
* Converts from an integer character offset to a pointer to a position
287
* Since 2.10, this function allows passing a negative @offset to
288
* step backwards. It is usually worth stepping backwards from the end
289
* instead of forwards if @offset is in the last fourth of the string,
290
* since moving forward is about 3 times faster than moving backward.
292
* Return value: the resulting pointer
295
g_utf8_offset_to_pointer (const gchar *str,
298
const gchar *s = str;
302
s = g_utf8_next_char (s);
307
/* This nice technique for fast backwards stepping
308
* through a UTF-8 string was dubbed "stutter stepping"
309
* by its inventor, Larry Ewing.
315
while ((*s & 0xc0) == 0x80)
318
offset += g_utf8_pointer_to_offset (s, s1);
326
* g_utf8_pointer_to_offset:
327
* @str: a UTF-8 encoded string
328
* @pos: a pointer to a position within @str
330
* Converts from a pointer to a position within a string to an integer
333
* Since 2.10, this function allows @pos to be before @str, and returns
334
* a negative offset in this case.
336
* Return value: the resulting character offset
339
g_utf8_pointer_to_offset (const gchar *str,
342
const gchar *s = str;
346
offset = - g_utf8_pointer_to_offset (pos, str);
350
s = g_utf8_next_char (s);
360
* @dest: buffer to fill with characters from @src
361
* @src: UTF-8 encoded string
362
* @n: character count
364
* Like the standard C strncpy() function, but
365
* copies a given number of characters instead of a given number of
366
* bytes. The @src string must be valid UTF-8 encoded text.
367
* (Use g_utf8_validate() on all text before trying to use UTF-8
368
* utility functions with it.)
370
* Return value: @dest
373
g_utf8_strncpy (gchar *dest,
377
const gchar *s = src;
380
s = g_utf8_next_char(s);
383
strncpy(dest, src, s - src);
388
G_LOCK_DEFINE_STATIC (aliases);
391
get_alias_hash (void)
393
static GHashTable *alias_hash = NULL;
400
alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
402
aliases = _g_locale_get_charset_aliases ();
403
while (*aliases != '\0')
405
const char *canonical;
407
const char **alias_array;
411
aliases += strlen (aliases) + 1;
413
aliases += strlen (aliases) + 1;
415
alias_array = g_hash_table_lookup (alias_hash, canonical);
418
while (alias_array[count])
422
alias_array = g_renew (const char *, alias_array, count + 2);
423
alias_array[count] = alias;
424
alias_array[count + 1] = NULL;
426
g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
435
/* As an abuse of the alias table, the following routine gets
436
* the charsets that are aliases for the canonical name.
438
G_GNUC_INTERNAL const char **
439
_g_charset_get_aliases (const char *canonical_name)
441
GHashTable *alias_hash = get_alias_hash ();
443
return g_hash_table_lookup (alias_hash, canonical_name);
447
g_utf8_get_charset_internal (const char *raw_data,
450
const char *charset = getenv("CHARSET");
452
if (charset && *charset)
456
if (charset && strstr (charset, "UTF-8"))
462
/* The libcharset code tries to be thread-safe without
463
* a lock, but has a memory leak and a missing memory
464
* barrier, so we lock for it
467
charset = _g_locale_charset_unalias (raw_data);
470
if (charset && *charset)
474
if (charset && strstr (charset, "UTF-8"))
480
/* Assume this for compatibility at present. */
486
typedef struct _GCharsetCache GCharsetCache;
488
struct _GCharsetCache {
495
charset_cache_free (gpointer data)
497
GCharsetCache *cache = data;
499
g_free (cache->charset);
505
* @charset: return location for character set name
507
* Obtains the character set for the current locale; you might use
508
* this character set as an argument to g_convert(), to convert from
509
* the current locale's encoding to some other encoding. (Frequently
510
* g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts,
513
* The return value is %TRUE if the locale's encoding is UTF-8, in that
514
* case you can perhaps avoid calling g_convert().
516
* The string returned in @charset is not allocated, and should not be
519
* Return value: %TRUE if the returned charset is UTF-8
522
g_get_charset (G_CONST_RETURN char **charset)
524
static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
525
GCharsetCache *cache = g_static_private_get (&cache_private);
530
cache = g_new0 (GCharsetCache, 1);
531
g_static_private_set (&cache_private, cache, charset_cache_free);
534
raw = _g_locale_charset_raw ();
536
if (!(cache->raw && strcmp (cache->raw, raw) == 0))
538
const gchar *new_charset;
541
g_free (cache->charset);
542
cache->raw = g_strdup (raw);
543
cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
544
cache->charset = g_strdup (new_charset);
548
*charset = cache->charset;
550
return cache->is_utf8;
557
* @c: a Unicode character code
558
* @outbuf: output buffer, must have at least 6 bytes of space.
559
* If %NULL, the length will be computed and returned
560
* and nothing will be written to @outbuf.
562
* Converts a single character to UTF-8.
564
* Return value: number of bytes written
567
g_unichar_to_utf8 (gunichar c,
570
/* If this gets modified, also update the copy in g_string_insert_unichar() */
585
else if (c < 0x10000)
590
else if (c < 0x200000)
595
else if (c < 0x4000000)
608
for (i = len - 1; i > 0; --i)
610
outbuf[i] = (c & 0x3f) | 0x80;
613
outbuf[0] = c | first;
621
* @p: a nul-terminated UTF-8 encoded string
622
* @len: the maximum length of @p
623
* @c: a Unicode character
625
* Finds the leftmost occurrence of the given Unicode character
626
* in a UTF-8 encoded string, while limiting the search to @len bytes.
627
* If @len is -1, allow unbounded search.
629
* Return value: %NULL if the string does not contain the character,
630
* otherwise, a pointer to the start of the leftmost occurrence of
631
* the character in the string.
634
g_utf8_strchr (const char *p,
640
gint charlen = g_unichar_to_utf8 (c, ch);
643
return g_strstr_len (p, len, ch);
649
* @p: a nul-terminated UTF-8 encoded string
650
* @len: the maximum length of @p
651
* @c: a Unicode character
653
* Find the rightmost occurrence of the given Unicode character
654
* in a UTF-8 encoded string, while limiting the search to @len bytes.
655
* If @len is -1, allow unbounded search.
657
* Return value: %NULL if the string does not contain the character,
658
* otherwise, a pointer to the start of the rightmost occurrence of the
659
* character in the string.
662
g_utf8_strrchr (const char *p,
668
gint charlen = g_unichar_to_utf8 (c, ch);
671
return g_strrstr_len (p, len, ch);
675
/* Like g_utf8_get_char, but take a maximum length
676
* and return (gunichar)-2 on incomplete trailing character
678
static inline gunichar
679
g_utf8_get_char_extended (const gchar *p,
683
gunichar wc = (guchar) *p;
723
if (max_len >= 0 && len > max_len)
725
for (i = 1; i < max_len; i++)
727
if ((((guchar *)p)[i] & 0xc0) != 0x80)
733
for (i = 1; i < len; ++i)
735
gunichar ch = ((guchar *)p)[i];
737
if ((ch & 0xc0) != 0x80)
749
if (UTF8_LENGTH(wc) != len)
756
* g_utf8_get_char_validated:
757
* @p: a pointer to Unicode character encoded as UTF-8
758
* @max_len: the maximum number of bytes to read, or -1, for no maximum.
760
* Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
761
* This function checks for incomplete characters, for invalid characters
762
* such as characters that are out of the range of Unicode, and for
763
* overlong encodings of valid characters.
765
* Return value: the resulting character. If @p points to a partial
766
* sequence at the end of a string that could begin a valid
767
* character (or if @max_len is zero), returns (gunichar)-2;
768
* otherwise, if @p does not point to a valid UTF-8 encoded
769
* Unicode character, returns (gunichar)-1.
772
g_utf8_get_char_validated (const gchar *p,
780
result = g_utf8_get_char_extended (p, max_len);
782
if (result & 0x80000000)
784
else if (!UNICODE_VALID (result))
791
* g_utf8_to_ucs4_fast:
792
* @str: a UTF-8 encoded string
793
* @len: the maximum length of @str to use. If @len < 0, then
794
* the string is nul-terminated.
795
* @items_written: location to store the number of characters in the
798
* Convert a string from UTF-8 to a 32-bit fixed width
799
* representation as UCS-4, assuming valid UTF-8 input.
800
* This function is roughly twice as fast as g_utf8_to_ucs4()
801
* but does no error checking on the input.
803
* Return value: a pointer to a newly allocated UCS-4 string.
804
* This value must be freed with g_free().
807
g_utf8_to_ucs4_fast (const gchar *str,
809
glong *items_written)
816
g_return_val_if_fail (str != NULL, NULL);
824
p = g_utf8_next_char (p);
830
while (p < str + len && *p)
832
p = g_utf8_next_char (p);
837
result = g_new (gunichar, n_chars + 1);
840
for (i=0; i < n_chars; i++)
842
gunichar wc = ((unsigned char *)p)[0];
877
for (j = 1; j < charlen; j++)
880
wc |= ((unsigned char *)p)[j] & 0x3f;
897
* @str: a UTF-8 encoded string
898
* @len: the maximum length of @str to use. If @len < 0, then
899
* the string is nul-terminated.
900
* @items_read: location to store number of bytes read, or %NULL.
901
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
902
* returned in case @str contains a trailing partial
903
* character. If an error occurs then the index of the
904
* invalid input is stored here.
905
* @items_written: location to store number of characters written or %NULL.
906
* The value here stored does not include the trailing 0
908
* @error: location to store the error occurring, or %NULL to ignore
909
* errors. Any of the errors in #GConvertError other than
910
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
912
* Convert a string from UTF-8 to a 32-bit fixed width
913
* representation as UCS-4. A trailing 0 will be added to the
914
* string after the converted text.
916
* Return value: a pointer to a newly allocated UCS-4 string.
917
* This value must be freed with g_free(). If an
918
* error occurs, %NULL will be returned and
922
g_utf8_to_ucs4 (const gchar *str,
925
glong *items_written,
928
gunichar *result = NULL;
934
while ((len < 0 || str + len - in > 0) && *in)
936
gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
939
if (wc == (gunichar)-2)
944
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
945
_("Partial character sequence at end of input"));
948
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
949
_("Invalid byte sequence in conversion input"));
956
in = g_utf8_next_char (in);
959
result = g_new (gunichar, n_chars + 1);
962
for (i=0; i < n_chars; i++)
964
result[i] = g_utf8_get_char (in);
965
in = g_utf8_next_char (in);
970
*items_written = n_chars;
974
*items_read = in - str;
981
* @str: a UCS-4 encoded string
982
* @len: the maximum length (number of characters) of @str to use.
983
* If @len < 0, then the string is terminated with a 0 character.
984
* @items_read: location to store number of characters read, or %NULL.
985
* @items_written: location to store number of bytes written or %NULL.
986
* The value here stored does not include the trailing 0
988
* @error: location to store the error occurring, or %NULL to ignore
989
* errors. Any of the errors in #GConvertError other than
990
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
992
* Convert a string from a 32-bit fixed width representation as UCS-4
993
* to UTF-8. The result will be terminated with a 0 byte.
995
* Return value: a pointer to a newly allocated UTF-8 string.
996
* This value must be freed with g_free(). If an
997
* error occurs, %NULL will be returned and
998
* @error set. In that case, @items_read will be
999
* set to the position of the first invalid input
1003
g_ucs4_to_utf8 (const gunichar *str,
1006
glong *items_written,
1010
gchar *result = NULL;
1015
for (i = 0; len < 0 || i < len ; i++)
1020
if (str[i] >= 0x80000000)
1022
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1023
_("Character out of range for UTF-8"));
1027
result_length += UTF8_LENGTH (str[i]);
1030
result = g_malloc (result_length + 1);
1034
while (p < result + result_length)
1035
p += g_unichar_to_utf8 (str[i++], p);
1040
*items_written = p - result;
1049
/* Combine a surrogate pair -- @h the high surrogate, @l the low
 * surrogate -- into the single character value it stands for. Each
 * argument appears exactly once in the expansion. */
#define SURROGATE_VALUE(h,l) \
  (0x10000 + ((l) - 0xdc00) + 0x400 * ((h) - 0xd800))
1053
* @str: a UTF-16 encoded string
1054
* @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
1055
* If @len < 0, then the string is terminated with a 0 character.
1056
* @items_read: location to store number of words read, or %NULL.
1057
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1058
* returned in case @str contains a trailing partial
1059
* character. If an error occurs then the index of the
1060
* invalid input is stored here.
1061
* @items_written: location to store number of bytes written, or %NULL.
1062
* The value stored here does not include the trailing
1064
* @error: location to store the error occurring, or %NULL to ignore
1065
* errors. Any of the errors in #GConvertError other than
1066
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
1068
* Convert a string from UTF-16 to UTF-8. The result will be
1069
* terminated with a 0 byte.
1071
* Note that the input is expected to be already in native endianness,
1072
* an initial byte-order-mark character is not handled specially.
1073
* g_convert() can be used to convert a byte buffer of UTF-16 data of
1074
* ambiguous endianness.
1076
* Return value: a pointer to a newly allocated UTF-8 string.
1077
* This value must be freed with g_free(). If an
1078
* error occurs, %NULL will be returned and
1082
g_utf16_to_utf8 (const gunichar2 *str,
1085
glong *items_written,
1088
/* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
1091
const gunichar2 *in;
1093
gchar *result = NULL;
1095
gunichar high_surrogate;
1097
g_return_val_if_fail (str != 0, NULL);
1102
while ((len < 0 || in - str < len) && *in)
1107
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1111
wc = SURROGATE_VALUE (high_surrogate, c);
1116
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1117
_("Invalid sequence in conversion input"));
1125
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1126
_("Invalid sequence in conversion input"));
1130
if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1139
/********** DIFFERENT for UTF8/UCS4 **********/
1140
n_bytes += UTF8_LENGTH (wc);
1146
if (high_surrogate && !items_read)
1148
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1149
_("Partial character sequence at end of input"));
1153
/* At this point, everything is valid, and we just need to convert
1155
/********** DIFFERENT for UTF8/UCS4 **********/
1156
result = g_malloc (n_bytes + 1);
1161
while (out < result + n_bytes)
1166
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1168
wc = SURROGATE_VALUE (high_surrogate, c);
1171
else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1179
/********** DIFFERENT for UTF8/UCS4 **********/
1180
out += g_unichar_to_utf8 (wc, out);
1186
/********** DIFFERENT for UTF8/UCS4 **********/
1190
/********** DIFFERENT for UTF8/UCS4 **********/
1191
*items_written = out - result;
1195
*items_read = in - str;
1202
* @str: a UTF-16 encoded string
1203
* @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
1204
* If @len < 0, then the string is terminated with a 0 character.
1205
* @items_read: location to store number of words read, or %NULL.
1206
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1207
* returned in case @str contains a trailing partial
1208
* character. If an error occurs then the index of the
1209
* invalid input is stored here.
1210
* @items_written: location to store number of characters written, or %NULL.
1211
* The value stored here does not include the trailing
1213
* @error: location to store the error occurring, or %NULL to ignore
1214
* errors. Any of the errors in #GConvertError other than
1215
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
1217
* Convert a string from UTF-16 to UCS-4. The result will be
1218
* terminated with a 0 character.
1220
* Return value: a pointer to a newly allocated UCS-4 string.
1221
* This value must be freed with g_free(). If an
1222
* error occurs, %NULL will be returned and
1226
g_utf16_to_ucs4 (const gunichar2 *str,
1229
glong *items_written,
1232
const gunichar2 *in;
1234
gchar *result = NULL;
1236
gunichar high_surrogate;
1238
g_return_val_if_fail (str != 0, NULL);
1243
while ((len < 0 || in - str < len) && *in)
1248
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1252
wc = SURROGATE_VALUE (high_surrogate, c);
1257
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1258
_("Invalid sequence in conversion input"));
1266
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1267
_("Invalid sequence in conversion input"));
1271
if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1280
/********** DIFFERENT for UTF8/UCS4 **********/
1281
n_bytes += sizeof (gunichar);
1287
if (high_surrogate && !items_read)
1289
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1290
_("Partial character sequence at end of input"));
1294
/* At this point, everything is valid, and we just need to convert
1296
/********** DIFFERENT for UTF8/UCS4 **********/
1297
result = g_malloc (n_bytes + 4);
1302
while (out < result + n_bytes)
1307
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1309
wc = SURROGATE_VALUE (high_surrogate, c);
1312
else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1320
/********** DIFFERENT for UTF8/UCS4 **********/
1321
*(gunichar *)out = wc;
1322
out += sizeof (gunichar);
1328
/********** DIFFERENT for UTF8/UCS4 **********/
1329
*(gunichar *)out = 0;
1332
/********** DIFFERENT for UTF8/UCS4 **********/
1333
*items_written = (out - result) / sizeof (gunichar);
1337
*items_read = in - str;
1339
return (gunichar *)result;
1344
* @str: a UTF-8 encoded string
1345
* @len: the maximum length (number of characters) of @str to use.
1346
* If @len < 0, then the string is nul-terminated.
1347
* @items_read: location to store number of bytes read, or %NULL.
1348
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1349
* returned in case @str contains a trailing partial
1350
* character. If an error occurs then the index of the
1351
* invalid input is stored here.
1352
* @items_written: location to store number of <type>gunichar2</type> written,
1354
* The value stored here does not include the trailing 0.
1355
* @error: location to store the error occurring, or %NULL to ignore
1356
* errors. Any of the errors in #GConvertError other than
1357
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
1359
* Convert a string from UTF-8 to UTF-16. A 0 character will be
1360
* added to the result after the converted text.
1362
* Return value: a pointer to a newly allocated UTF-16 string.
1363
* This value must be freed with g_free(). If an
1364
* error occurs, %NULL will be returned and
1368
g_utf8_to_utf16 (const gchar *str,
1371
glong *items_written,
1374
gunichar2 *result = NULL;
1379
g_return_val_if_fail (str != NULL, NULL);
1383
while ((len < 0 || str + len - in > 0) && *in)
1385
gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
1386
if (wc & 0x80000000)
1388
if (wc == (gunichar)-2)
1393
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1394
_("Partial character sequence at end of input"));
1397
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1398
_("Invalid byte sequence in conversion input"));
1405
else if (wc < 0xe000)
1407
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1408
_("Invalid sequence in conversion input"));
1412
else if (wc < 0x10000)
1414
else if (wc < 0x110000)
1418
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1419
_("Character out of range for UTF-16"));
1424
in = g_utf8_next_char (in);
1427
result = g_new (gunichar2, n16 + 1);
1430
for (i = 0; i < n16;)
1432
gunichar wc = g_utf8_get_char (in);
1440
result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1441
result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1444
in = g_utf8_next_char (in);
1450
*items_written = n16;
1454
*items_read = in - str;
1461
* @str: a UCS-4 encoded string
1462
* @len: the maximum length (number of characters) of @str to use.
1463
* If @len < 0, then the string is terminated with a 0 character.
1464
* @items_read: location to store number of characters read, or %NULL.
1465
* If an error occurs then the index of the invalid input
1467
* @items_written: location to store number of <type>gunichar2</type>
1468
* written, or %NULL. The value stored here does not
1469
* include the trailing 0.
1470
* @error: location to store the error occurring, or %NULL to ignore
1471
* errors. Any of the errors in #GConvertError other than
1472
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
1474
* Convert a string from UCS-4 to UTF-16. A 0 character will be
1475
* added to the result after the converted text.
1477
* Return value: a pointer to a newly allocated UTF-16 string.
1478
* This value must be freed with g_free(). If an
1479
* error occurs, %NULL will be returned and
1483
g_ucs4_to_utf16 (const gunichar *str,
1486
glong *items_written,
1489
gunichar2 *result = NULL;
1495
while ((len < 0 || i < len) && str[i])
1497
gunichar wc = str[i];
1501
else if (wc < 0xe000)
1503
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1504
_("Invalid sequence in conversion input"));
1508
else if (wc < 0x10000)
1510
else if (wc < 0x110000)
1514
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1515
_("Character out of range for UTF-16"));
1523
result = g_new (gunichar2, n16 + 1);
1525
for (i = 0, j = 0; j < n16; i++)
1527
gunichar wc = str[i];
1535
result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1536
result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1542
*items_written = n16;
1551
#define CONTINUATION_CHAR \
1553
if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
1556
val |= (*(guchar *)p) & 0x3f; \
1559
static const gchar *
1560
fast_validate (const char *str)
1567
for (p = str; *p; p++)
1569
if (*(guchar *)p < 128)
1576
if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
1578
if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
1581
if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
1586
if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
1589
val = *(guchar *)p & 0x0f;
1592
else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
1595
val = *(guchar *)p & 0x07;
1608
if (G_UNLIKELY (val < min))
1611
if (G_UNLIKELY (!UNICODE_VALID(val)))
1625
static const gchar *
1626
fast_validate_len (const char *str,
1634
for (p = str; (max_len < 0 || (p - str) < max_len) && *p; p++)
1636
if (*(guchar *)p < 128)
1643
if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
1645
if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 2))
1648
if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
1651
if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
1656
if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
1658
if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 3))
1662
val = *(guchar *)p & 0x0f;
1665
else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
1667
if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 4))
1671
val = *(guchar *)p & 0x07;
1684
if (G_UNLIKELY (val < min))
1686
if (G_UNLIKELY (!UNICODE_VALID(val)))
1702
* @str: a pointer to character data
1703
* @max_len: max bytes to validate, or -1 to go until NUL
1704
* @end: return location for end of valid data
1706
* Validates UTF-8 encoded text. @str is the text to validate;
1707
* if @str is nul-terminated, then @max_len can be -1, otherwise
1708
* @max_len should be the number of bytes to validate.
1709
* If @end is non-%NULL, then the end of the valid range
1710
* will be stored there (i.e. the start of the first invalid
1711
* character if some bytes were invalid, or the end of the text
1712
* being validated otherwise).
1714
* Note that g_utf8_validate() returns %FALSE if @max_len is
1715
* positive and NUL is met before @max_len bytes have been read.
1717
* Returns %TRUE if all of @str was valid. Many GLib and GTK+
1718
* routines <emphasis>require</emphasis> valid UTF-8 as input;
1719
* so data read from a file or the network should be checked
1720
* with g_utf8_validate() before doing anything else with it.
1722
* Return value: %TRUE if the text was valid UTF-8
1725
g_utf8_validate (const char *str,
1733
p = fast_validate (str);
1735
p = fast_validate_len (str, max_len);
1740
if ((max_len >= 0 && p != str + max_len) ||
1741
(max_len < 0 && *p != '\0'))
1749
* g_unichar_validate:
1750
* @ch: a Unicode character
1752
* Checks whether @ch is a valid Unicode character. Some possible
1753
* integer values of @ch will not be valid. 0 is considered a valid
1754
* character, though it's normally a string terminator.
1756
* Return value: %TRUE if @ch is a valid Unicode character
1759
g_unichar_validate (gunichar ch)
1761
return UNICODE_VALID (ch);
1765
* g_utf8_strreverse:
1766
* @str: a UTF-8 encoded string
1767
* @len: the maximum length of @str to use. If @len < 0, then
1768
* the string is nul-terminated.
1770
* Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
1771
* (Use g_utf8_validate() on all text before trying to use UTF-8
1772
* utility functions with it.)
1774
* Note that unlike g_strreverse(), this function returns
1775
* newly-allocated memory, which should be freed with g_free() when
1778
* Returns: a newly-allocated string which is the reverse of @str.
1783
g_utf8_strreverse (const gchar *str,
1793
result = g_new (gchar, len + 1);
1798
skip = g_utf8_skip[*(guchar*)p];
1800
for (m = r; skip; skip--)
1810
_g_utf8_make_valid (const gchar *name)
1813
const gchar *remainder, *invalid;
1814
gint remaining_bytes, valid_bytes;
1818
remaining_bytes = strlen (name);
1820
while (remaining_bytes != 0)
1822
if (g_utf8_validate (remainder, remaining_bytes, &invalid))
1824
valid_bytes = invalid - remainder;
1827
string = g_string_sized_new (remaining_bytes);
1829
g_string_append_len (string, remainder, valid_bytes);
1830
/* append U+FFFD REPLACEMENT CHARACTER */
1831
g_string_append (string, "\357\277\275");
1833
remaining_bytes -= valid_bytes + 1;
1834
remainder = invalid + 1;
1838
return g_strdup (name);
1840
g_string_append (string, remainder);
1842
g_assert (g_utf8_validate (string->str, -1, NULL));
1844
return g_string_free (string, FALSE);
1848
#define __G_UTF8_C__
1849
#include "galiasdef.c"