1
/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2
/* cairo - a vector graphics library with display and print output
4
* The code in this file is derived from GLib's gutf8.c and
5
* ultimately from libunicode. It is relicensed under the
6
* dual LGPL/MPL with permission of the original authors.
8
* Copyright © 1999 Tom Tromey
9
* Copyright © 2005 Red Hat, Inc
11
* This library is free software; you can redistribute it and/or
12
* modify it either under the terms of the GNU Lesser General Public
13
* License version 2.1 as published by the Free Software Foundation
14
* (the "LGPL") or, at your option, under the terms of the Mozilla
15
* Public License Version 1.1 (the "MPL"). If you do not alter this
16
* notice, a recipient may use your version of this file under either
17
* the MPL or the LGPL.
19
* You should have received a copy of the LGPL along with this library
20
* in the file COPYING-LGPL-2.1; if not, write to the Free Software
21
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22
* You should have received a copy of the MPL along with this library
23
* in the file COPYING-MPL-1.1
25
* The contents of this file are subject to the Mozilla Public License
26
* Version 1.1 (the "License"); you may not use this file except in
27
* compliance with the License. You may obtain a copy of the License at
28
* http://www.mozilla.org/MPL/
30
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31
* OF ANY KIND, either express or implied. See the LGPL or the MPL for
32
* the specific language governing rights and limitations.
34
* The Original Code is the cairo graphics library.
36
* The Initial Developer of the Original Code is Tom Tromey.
40
* Owen Taylor <otaylor@redhat.com>
45
#define UTF8_COMPUTE(Char, Mask, Len) \
51
else if ((Char & 0xe0) == 0xc0) \
56
else if ((Char & 0xf0) == 0xe0) \
61
else if ((Char & 0xf8) == 0xf0) \
66
else if ((Char & 0xfc) == 0xf8) \
71
else if ((Char & 0xfe) == 0xfc) \
79
/* Number of bytes needed to encode the code point Char in UTF-8, using the
 * historical scheme that allows sequences of up to 6 bytes.
 *
 * Char is evaluated several times, so it must not have side effects.
 * The continuation lines must stay contiguous: anything placed between
 * them terminates the macro early. */
#define UTF8_LENGTH(Char)			\
    ((Char) < 0x80 ? 1 :			\
     ((Char) < 0x800 ? 2 :			\
      ((Char) < 0x10000 ? 3 :			\
       ((Char) < 0x200000 ? 4 :		\
	((Char) < 0x4000000 ? 5 : 6)))))
86
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
87
(Result) = (Chars)[0] & (Mask); \
88
for ((Count) = 1; (Count) < (Len); ++(Count)) \
90
if (((Chars)[(Count)] & 0xc0) != 0x80) \
96
(Result) |= ((Chars)[(Count)] & 0x3f); \
99
/* True when Char is a code point this file accepts as a valid character:
 *   - below 0x110000,
 *   - not in the surrogate range 0xD800..0xDFFF,
 *   - not in 0xFDD0..0xFDEF,
 *   - low 16 bits not 0xFFFE or 0xFFFF.
 *
 * Char is evaluated several times, so it must not have side effects.
 * The continuation lines must stay contiguous: anything placed between
 * them terminates the macro early. */
#define UNICODE_VALID(Char)			\
    ((Char) < 0x110000 &&			\
     (((Char) & 0xFFFFF800) != 0xD800) &&	\
     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&	\
     ((Char) & 0xFFFE) != 0xFFFE)
105
static const char utf8_skip_data[256] = {
106
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
107
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
108
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
109
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
110
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
111
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
112
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
113
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
116
/* Advance p past one UTF-8 sequence, using the lead byte at *p as an index
 * into utf8_skip_data to get the sequence length.  Only the first byte is
 * read; continuation bytes are not checked.  The cast keeps the pointee
 * const so the macro can be applied to const cursors without discarding
 * the qualifier.  p is evaluated twice. */
#define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(const unsigned char *)(p)])
118
/* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
119
* If @p does not point to a valid UTF-8 encoded character, results are
123
_utf8_get_char (const unsigned char *p)
125
int i, mask = 0, len;
127
unsigned char c = (unsigned char) *p;
129
UTF8_COMPUTE (c, mask, len);
132
UTF8_GET (result, p, i, mask, len);
137
/* Like _utf8_get_char, but take a maximum length
138
* and return (uint32_t)-2 on incomplete trailing character
141
_utf8_get_char_extended (const unsigned char *p,
145
uint32_t wc = (unsigned char) *p;
149
} else if (wc < 0xc0) {
151
} else if (wc < 0xe0) {
154
} else if (wc < 0xf0) {
157
} else if (wc < 0xf8) {
160
} else if (wc < 0xfc) {
163
} else if (wc < 0xfe) {
170
if (max_len >= 0 && len > max_len) {
171
for (i = 1; i < max_len; i++) {
172
if ((((unsigned char *)p)[i] & 0xc0) != 0x80)
178
for (i = 1; i < len; ++i) {
179
uint32_t ch = ((unsigned char *)p)[i];
181
if ((ch & 0xc0) != 0x80) {
192
if (UTF8_LENGTH(wc) != len)
199
* _cairo_utf8_get_char_validated:
201
* @unicode: location to store one Unicode character
203
* Decodes the first character of a valid UTF-8 string, and returns
204
* the number of bytes consumed.
206
* Note that the string should be valid. Do not use this without
207
* validating the string first.
209
* Returns: the number of bytes forming the character returned.
212
_cairo_utf8_get_char_validated (const char *p,
215
int i, mask = 0, len;
217
unsigned char c = (unsigned char) *p;
219
UTF8_COMPUTE (c, mask, len);
222
*unicode = (uint32_t)-1;
225
UTF8_GET (result, p, i, mask, len);
233
* _cairo_utf8_to_ucs4:
234
* @str: a UTF-8 string
235
* @len: length of @str in bytes, or -1 if it is nul-terminated.
236
* If @len is supplied and the string has an embedded nul
237
* byte, only the portion before the nul byte is converted.
238
* @result: location to store a pointer to a newly allocated UTF-32
239
* string (always native endian), or %NULL. Free with free(). A 0
240
* word will be written after the last character.
241
* @items_written: location to store number of 32-bit words
242
* written. (Not including the trailing 0)
244
* Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
245
* with 1 32-bit word per character. The string is validated to
246
* consist entirely of valid Unicode characters.
248
* Return value: %CAIRO_STATUS_SUCCESS if the entire string was
249
* successfully converted. %CAIRO_STATUS_INVALID_STRING if an
250
* invalid sequence was found.
253
_cairo_utf8_to_ucs4 (const char *str,
258
uint32_t *str32 = NULL;
260
const unsigned char *in;
261
const unsigned char * const ustr = (const unsigned char *) str;
265
while ((len < 0 || ustr + len - in > 0) && *in)
267
uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
268
if (wc & 0x80000000 || !UNICODE_VALID (wc))
269
return _cairo_error (CAIRO_STATUS_INVALID_STRING);
272
if (n_chars == INT_MAX)
273
return _cairo_error (CAIRO_STATUS_INVALID_STRING);
275
in = UTF8_NEXT_CHAR (in);
279
str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
281
return _cairo_error (CAIRO_STATUS_NO_MEMORY);
284
for (i=0; i < n_chars; i++) {
285
str32[i] = _utf8_get_char (in);
286
in = UTF8_NEXT_CHAR (in);
294
*items_written = n_chars;
296
return CAIRO_STATUS_SUCCESS;
300
* _cairo_ucs4_to_utf8:
301
* @unicode: a UCS-4 character
302
* @utf8: buffer to write utf8 string into. Must have at least 4 bytes
303
* space available. Or %NULL.
305
* Return value: Number of bytes in the utf8 string or 0 if an invalid
309
_cairo_ucs4_to_utf8 (uint32_t unicode,
315
if (unicode < 0x80) {
319
} else if (unicode < 0x800) {
321
} else if (unicode < 0x10000) {
323
} else if (unicode < 0x200000) {
334
*--p = 0x80 | (unicode & 0x3f);
337
*p |= 0xf0 << (4 - bytes);
342
#if CAIRO_HAS_UTF8_TO_UTF16
344
* _cairo_utf8_to_utf16:
345
* @str: a UTF-8 string
346
* @len: length of @str in bytes, or -1 if it is nul-terminated.
347
* If @len is supplied and the string has an embedded nul
348
* byte, only the portion before the nul byte is converted.
349
* @result: location to store a pointer to a newly allocated UTF-16
350
* string (always native endian). Free with free(). A 0
351
* word will be written after the last character.
352
* @items_written: location to store number of 16-bit words
353
* written. (Not including the trailing 0)
355
* Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
356
* where characters are represented either as a single 16-bit word, or
357
* as a pair of 16-bit "surrogates". The string is validated to
358
* consist entirely of valid Unicode characters.
360
* Return value: %CAIRO_STATUS_SUCCESS if the entire string was
361
* successfully converted. %CAIRO_STATUS_INVALID_STRING if an
362
* invalid sequence was found.
365
_cairo_utf8_to_utf16 (const char *str,
370
uint16_t *str16 = NULL;
372
const unsigned char *in;
373
const unsigned char * const ustr = (const unsigned char *) str;
377
while ((len < 0 || ustr + len - in > 0) && *in) {
378
uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
379
if (wc & 0x80000000 || !UNICODE_VALID (wc))
380
return _cairo_error (CAIRO_STATUS_INVALID_STRING);
387
if (n16 == INT_MAX - 1 || n16 == INT_MAX)
388
return _cairo_error (CAIRO_STATUS_INVALID_STRING);
390
in = UTF8_NEXT_CHAR (in);
393
str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
395
return _cairo_error (CAIRO_STATUS_NO_MEMORY);
398
for (i = 0; i < n16;) {
399
uint32_t wc = _utf8_get_char (in);
404
str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
405
str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
408
in = UTF8_NEXT_CHAR (in);
415
*items_written = n16;
417
return CAIRO_STATUS_SUCCESS;