2
* Copyright (C) 1999 Tom Tromey
3
* Copyright (C) 2000 Red Hat, Inc.
4
* Copyright (C) 2014 Savoir-Faire Linux Inc.
6
* Author: Pascal Potvin <pascal.potvin@extenway.com>
8
* This program is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License as published by
10
* the Free Software Foundation; either version 3 of the License, or
11
* (at your option) any later version.
13
* This program is distributed in the hope that it will be useful,
14
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
* GNU General Public License for more details.
18
* You should have received a copy of the GNU General Public License
19
* along with this program; if not, write to the Free Software
20
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22
* Additional permission under GNU GPL version 3 section 7:
24
* If you modify this program, or any covered work, by linking or
25
* combining it with the OpenSSL project's OpenSSL library (or a
26
* modified version of that library), containing parts covered by the
27
* terms of the OpenSSL or SSLeay licenses, Savoir-Faire Linux Inc.
28
* grants you additional permission to convey the resulting work.
29
* Corresponding Source for a non-source form of such a combination
30
* shall include the source code for the parts of OpenSSL used as well
31
* as that of the covered work.
37
#include "utf8_utils.h"
40
* The LIKELY and UNLIKELY macros let the programmer give hints to
41
* the compiler about the expected result of an expression. Some compilers
42
* can use this information for optimizations.
44
#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
45
#define LIKELY(expr) (__builtin_expect (expr, 1))
46
#define UNLIKELY(expr) (__builtin_expect (expr, 0))
48
#define LIKELY(expr) (expr)
49
#define UNLIKELY(expr) (expr)
54
* Check whether a Unicode (5.2) char is in a valid range.
56
* The first check comes from the Unicode guarantee to never encode
57
* a point above 0x0010ffff, since UTF-16 couldn't represent it.
59
* The second check rejects surrogate code points U+D800..U+DFFF (category Cs).
61
* @param Char the character
63
#define UNICODE_VALID(Char) \
64
((Char) < 0x110000 && \
65
(((Char) & 0xFFFFF800) != 0xD800))
67
/*
 * Statement macro expanded inside the validators below. It relies on a
 * byte pointer named p and an accumulator named val being in scope at the
 * expansion site: it tests that the byte at p is a UTF-8 continuation byte
 * (bit pattern 10xxxxxx) and merges that byte's low six payload bits
 * into val.
 */
#define CONTINUATION_CHAR \
68
if ((*(unsigned char *)p & 0xc0) != 0x80) /* 10xxxxxx */ \
71
val |= (*(unsigned char *)p) & 0x3f;
74
/*
 * fast_validate: walk the nul-terminated string @str one byte at a time,
 * classifying each lead byte by its high bits and checking the bytes that
 * follow it against the UTF-8 bit patterns noted on each test.
 */
fast_validate(const char *str)
80
/* The scan is bounded only by the terminating nul byte. */
for (p = str; *p; p++) {
81
/* Bytes below 128 (0xxxxxxx) are complete single-byte sequences. */
if (*(unsigned char *)p < 128)
88
/* Lead byte of a two-byte sequence. */
if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
89
/*
 * With payload bits 0x1e all clear the lead byte is 0xC0 or 0xC1, which
 * can only encode values below 0x80, i.e. an overlong form.
 */
if (UNLIKELY((*(unsigned char *)p & 0x1e) == 0))
94
/* The byte at p at this point is required to be a continuation byte. */
if (UNLIKELY((*(unsigned char *)p & 0xc0) != 0x80)) /* 10xxxxxx */
97
/* Lead byte of a three-byte sequence: keep its four payload bits. */
if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
99
val = *(unsigned char *)p & 0x0f;
101
/* Lead byte of a four-byte sequence: keep its three payload bits. */
} else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
103
val = *(unsigned char *)p & 0x07;
115
/*
 * Presumably min is the smallest code point that genuinely needs a
 * sequence of this length, making this the overlong-encoding check for
 * three- and four-byte forms -- TODO confirm where min is assigned.
 */
if (UNLIKELY(val < min))
118
/* Range check: above 0x10FFFF or inside the 0xD800 block fails UNICODE_VALID. */
if (UNLIKELY(!UNICODE_VALID(val)))
133
/*
 * fast_validate_len: length-bounded counterpart of fast_validate(). The
 * same lead/continuation bit tests are applied, with an extra guard per
 * multi-byte form so that a sequence is not read past @max_len bytes.
 */
fast_validate_len(const char *str, ssize_t max_len)
139
/* Callers must pass a non-negative bound; a negative one is a programming error. */
assert(max_len >= 0);
141
/* Stop at @max_len bytes or at a nul byte, whichever comes first. */
for (p = str; ((p - str) < max_len) && *p; p++) {
142
/* Bytes below 128 (0xxxxxxx) are complete single-byte sequences. */
if (*(unsigned char *)p < 128)
149
/* Lead byte of a two-byte sequence. */
if ((*(unsigned char *)p & 0xe0) == 0xc0) { /* 110xxxxx */
150
/* Fewer than 2 bytes remain inside the bound: the sequence would be truncated. */
if (UNLIKELY(max_len - (p - str) < 2))
153
/*
 * With payload bits 0x1e all clear the lead byte is 0xC0 or 0xC1, which
 * can only encode values below 0x80, i.e. an overlong form.
 */
if (UNLIKELY((*(unsigned char *)p & 0x1e) == 0))
158
/* The byte at p at this point is required to be a continuation byte. */
if (UNLIKELY((*(unsigned char *)p & 0xc0) != 0x80)) /* 10xxxxxx */
161
/* Lead byte of a three-byte sequence. */
if ((*(unsigned char *)p & 0xf0) == 0xe0) { /* 1110xxxx */
162
/* Fewer than 3 bytes remain inside the bound. */
if (UNLIKELY(max_len - (p - str) < 3))
166
/* Keep the four payload bits of the lead byte. */
val = *(unsigned char *)p & 0x0f;
168
/* Lead byte of a four-byte sequence. */
} else if ((*(unsigned char *)p & 0xf8) == 0xf0) { /* 11110xxx */
169
/* Fewer than 4 bytes remain inside the bound. */
if (UNLIKELY(max_len - (p - str) < 4))
173
/* Keep the three payload bits of the lead byte. */
val = *(unsigned char *)p & 0x07;
185
/*
 * Presumably min is the smallest code point that genuinely needs a
 * sequence of this length, making this the overlong-encoding check for
 * three- and four-byte forms -- TODO confirm where min is assigned.
 */
if (UNLIKELY(val < min))
188
/* Range check: above 0x10FFFF or inside the 0xD800 block fails UNICODE_VALID. */
if (UNLIKELY(!UNICODE_VALID(val)))
203
* utf8_validate_c_str:
204
* @str: a pointer to character data
205
* @max_len: max bytes to validate, or -1 to go until the terminating nul
206
* @end: return location for end of valid data
208
* Validates UTF-8 encoded text. @str is the text to validate;
209
* if @str is nul-terminated, then @max_len can be -1, otherwise
210
* @max_len should be the number of bytes to validate.
211
* If @end is non-%NULL, then the end of the valid range
212
* will be stored there (i.e. the start of the first invalid
213
* character if some bytes were invalid, or the end of the text
214
* being validated otherwise).
216
* Note that utf8_validate_c_str() returns %false if @max_len is
217
* positive and any of the @max_len bytes are nul.
219
* Returns true if all of @str was valid. D-Bus requires valid UTF-8 as input;
220
* SIP packets should also be encoded in UTF-8; so data read from a file or the
221
* network should be checked with utf8_validate() before doing anything else
224
* Returns: true if the text was valid UTF-8
227
utf8_validate_c_str(const char *str, ssize_t max_len, const char **end)
232
/* Unbounded scan: stops at the terminating nul or at the first bad sequence. */
p = fast_validate(str);
234
/* Bounded scan: additionally stops once @max_len bytes have been examined. */
p = fast_validate_len(str, max_len);
239
/*
 * The scan ended early when p is not exactly at str + max_len (bounded
 * case) or not at the terminating nul (unbounded case). In the bounded
 * case a nul byte inside the first @max_len bytes also ends the scan
 * early, because fast_validate_len() stops at nul.
 */
if ((max_len >= 0 && p != str + max_len) ||
240
(max_len < 0 && *p != '\0'))
247
/*
 * utf8_validate: std::string entry point; runs fast_validate() over the
 * string's C representation.
 */
utf8_validate(const std::string & str)
251
/*
 * fast_validate() is driven by the nul terminator, so the scan ends at the
 * first nul byte of c_str().
 * NOTE(review): confirm how strings holding embedded nul bytes are meant
 * to be treated, since bytes after the first nul are not examined here.
 */
p = fast_validate(str.c_str());
257
/*
 * utf8_make_valid: build a valid UTF-8 string from @name by copying each
 * valid run of bytes and writing U+FFFD REPLACEMENT CHARACTER in place of
 * each offending byte.
 */
utf8_make_valid(const std::string & name)
259
/* Number of input bytes that have not been consumed yet. */
ssize_t remaining_bytes = name.size();
261
/* Start of the not-yet-consumed part of the input. */
const char *remainder = name.c_str();
266
while (remaining_bytes != 0) {
267
/* On failure, invalid receives the position where validation stopped. */
if (utf8_validate_c_str(remainder, remaining_bytes, &invalid))
270
/* Length of the valid run that precedes the offending byte. */
valid_bytes = invalid - remainder;
273
// If every byte is replaced by U+FFFD, max(strlen(string)) == 3 * name.size()
// NOTE(review): the bound below uses remaining_bytes rather than
// name.size(); confirm the two are equal whenever this allocation runs.
// NOTE(review): raw new[] -- confirm the buffer is released with delete[]
// on every path out of the function.
274
str = new char[3 * remaining_bytes];
278
/* Copy the valid run into the output buffer. */
strncpy(pos, remainder, valid_bytes);
281
/* append U+FFFD REPLACEMENT CHARACTER */
288
/* Consume the valid run plus exactly one offending byte. */
remaining_bytes -= valid_bytes + 1;
289
remainder = invalid + 1;
293
/* Hands back a plain copy of the input (presumably when nothing had to be replaced -- TODO confirm). */
return std::string(name);
295
/* Copy whatever input is left after the loop. */
strncpy(pos, remainder, remaining_bytes);
296
pos += remaining_bytes;
298
/* Built from an explicit length, so the buffer needs no nul terminator. */
std::string answer(str, pos - str);
299
/*
 * Sanity check that the rebuilt text validates as nul-terminated UTF-8.
 * assert() is compiled out when NDEBUG is defined, so this check is
 * absent from such builds.
 */
assert(utf8_validate_c_str(answer.c_str(), -1, NULL));