1
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4
This file is part of systemd.
6
Copyright 2012 Lennart Poettering
8
systemd is free software; you can redistribute it and/or modify it
9
under the terms of the GNU Lesser General Public License as published by
10
the Free Software Foundation; either version 2.1 of the License, or
11
(at your option) any later version.
13
systemd is distributed in the hope that it will be useful, but
14
WITHOUT ANY WARRANTY; without even the implied warranty of
15
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
Lesser General Public License for more details.
18
You should have received a copy of the GNU Lesser General Public License
19
along with systemd; If not, see <http://www.gnu.org/licenses/>.
22
/* This file is based on the GLIB utf8 validation functions. The
23
* original license text follows. */
25
/* gutf8.c - Operations on UTF-8 strings.
27
* Copyright (C) 1999 Tom Tromey
28
* Copyright (C) 2000 Red Hat, Inc.
30
* This library is free software; you can redistribute it and/or
31
* modify it under the terms of the GNU Library General Public
32
* License as published by the Free Software Foundation; either
33
* version 2 of the License, or (at your option) any later version.
35
* This library is distributed in the hope that it will be useful,
36
* but WITHOUT ANY WARRANTY; without even the implied warranty of
37
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38
* Library General Public License for more details.
40
* You should have received a copy of the GNU Library General Public
41
* License along with this library; if not, write to the Free Software
42
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
54
/* Substitute character. NOTE(review): presumably written in place of
 * invalid input by the filtering code; the use site is not visible in
 * this excerpt -- confirm. */
#define FILTER_CHAR '_'
56
/* Returns true if ch is a code point that may appear in interchanged
 * text: inside the Unicode code space, not a UTF-16 surrogate and not
 * one of the designated noncharacters. */
static inline bool is_unicode_valid(uint32_t ch) {

        if (ch >= 0x110000) /* End of unicode space */
                return false;
        if ((ch & 0xFFFFF800) == 0xD800) /* U+D800..U+DFFF, reserved for UTF-16 surrogates */
                return false;
        if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) /* Reserved noncharacter block */
                return false;
        if ((ch & 0xFFFE) == 0xFFFE) /* U+xFFFE and U+xFFFF, the last two code points of every plane */
                return false;

        return true;
}
70
/* Returns true if ch has the form 10xxxxxx, i.e. it is a UTF-8
 * continuation byte rather than ASCII or a lead byte. */
static inline bool is_continuation_char(uint8_t ch) {
        if ((ch & 0xc0) != 0x80) /* 10xxxxxx */
                return false;
        return true;
}
76
/* Appends the six payload bits of continuation byte ch to the code
 * point being accumulated in *u_ch. The caller is expected to have
 * checked that ch is a continuation byte. */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
        *u_ch <<= 6;
        *u_ch |= ch & 0x3f;
}
81
/* Returns true if ch is a C0 or C1 control code point, with the
 * exception of horizontal tab, which is tolerated. */
static bool is_unicode_control(uint32_t ch) {

        /*
          0 to ' '-1 is the C0 range.
          DEL=0x7F, and DEL+1 to 0x9F is C1 range.
          '\t' is in C0 range, but more or less harmless and commonly used.
        */

        return (ch < ' ' && ch != '\t') ||
                (0x7F <= ch && ch <= 0x9F);
}
93
/* Checks the first `length` bytes of str. Returns str if they consist
 * solely of well-formed, shortest-form UTF-8 sequences whose decoded
 * code points are not control characters (see is_unicode_control());
 * returns NULL otherwise. The scan is bounded by `length` only, so an
 * embedded NUL byte is treated as a control character and rejected. */
char* utf8_is_printable_n(const char* str, size_t length) {
        const uint8_t *p = (const uint8_t*) str;

        while (length > 0) {
                uint32_t val, min;
                size_t remaining;

                if (*p < 128) {
                        val = *p;
                        min = 0;
                        remaining = 0;
                } else if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
                        /* All five payload bits are kept; masking with
                         * 0x1e would drop the lowest one and decode e.g.
                         * U+00C0 as U+0080. */
                        val = (uint32_t) (*p & 0x1f);
                        min = 1 << 7;
                        remaining = 1;
                } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
                        val = (uint32_t) (*p & 0x0f);
                        min = 1 << 11;
                        remaining = 2;
                } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
                        val = (uint32_t) (*p & 0x07);
                        min = 1 << 16;
                        remaining = 3;
                } else
                        return NULL; /* stray continuation byte or invalid lead byte */

                p++;
                length--;

                /* Every continuation byte is bounds-checked before it is
                 * read, so a sequence truncated by `length` is rejected
                 * instead of reading past the buffer. */
                for (; remaining > 0; remaining--, p++, length--) {
                        if (!length || !is_continuation_char(*p))
                                return NULL;
                        merge_continuation_char(&val, *p);
                }

                if (val < min) /* overlong encoding */
                        return NULL;

                if (is_unicode_control(val))
                        return NULL;
        }

        return (char*) str;
}
152
static char* utf8_validate(const char *str, char *output) {
155
const uint8_t *p, *last;
161
o = (uint8_t*) output;
162
for (p = (const uint8_t*) str; *p; p++) {
169
if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
172
val = (uint32_t) (*p & 0x1e);
174
} else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
177
val = (uint32_t) (*p & 0x0f);
179
} else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
182
val = (uint32_t) (*p & 0x07);
187
if (!is_continuation_char(*p))
189
merge_continuation_char(&val, *p);
193
if (!is_continuation_char(*p))
195
merge_continuation_char(&val, *p);
199
if (!is_continuation_char(*p))
201
merge_continuation_char(&val, *p);
206
if (!is_unicode_valid(val))
210
memcpy(o, last, (size_t) size);
219
p = last; /* We retry at the next character */
239
/* Runs utf8_validate() on str without an output buffer and returns its
 * result unchanged. */
char* utf8_is_valid (const char *str) {
        return utf8_validate(str, NULL);
}
243
/* Allocates a buffer of strlen(str) + 1 bytes and passes it to
 * utf8_validate() as the output buffer, returning that call's result.
 * Returns NULL if the allocation fails. The buffer comes from malloc(),
 * so the caller releases a non-NULL result with free(). */
char* utf8_filter (const char *str) {
        char *new_str;

        new_str = malloc(strlen(str) + 1);
        if (!new_str)
                return NULL; /* must not fall through: a NULL output buffer
                              * means "validate only" to utf8_validate() */

        return utf8_validate(str, new_str);
}
255
/* Returns str if every byte of the NUL-terminated string is 7-bit
 * ASCII (value below 128), NULL as soon as a byte with the high bit
 * set is found. */
char *ascii_is_valid(const char *str) {
        const char *p;

        for (p = str; *p; p++)
                if ((unsigned char) *p >= 128)
                        return NULL;

        return (char*) str;
}
267
/* Returns a newly allocated copy of str from which every byte with the
 * high bit set (value 128 or above) has been dropped. The result is
 * NUL-terminated and must be released with free(). Returns NULL if the
 * allocation fails. */
char *ascii_filter(const char *str) {
        const char *s;
        char *r, *d;

        /* The filtered string can never be longer than the input. */
        r = malloc(strlen(str) + 1);
        if (!r)
                return NULL;

        for (s = str, d = r; *s; s++)
                if ((unsigned char) *s < 128)
                        *(d++) = *s;

        *d = 0;

        return r;
}
288
char *utf16_to_utf8(const void *s, size_t length) {
293
r = new(char, (length*3+1)/2 + 1);
299
for (f = s; f < (const uint8_t*) s + length; f += 2) {
302
c = (f[1] << 8) | f[0];
307
} else if (c < 0x80) {
308
*(t++) = (uint8_t) c;
309
} else if (c < 0x800) {
310
*(t++) = (uint8_t) (0xc0 | (c >> 6));
311
*(t++) = (uint8_t) (0x80 | (c & 0x3f));
313
*(t++) = (uint8_t) (0xe0 | (c >> 12));
314
*(t++) = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
315
*(t++) = (uint8_t) (0x80 | (c & 0x3f));