2
* utf_validate.c: Validate a UTF-8 string
4
* ====================================================================
5
* Copyright (c) 2004 CollabNet. All rights reserved.
7
* This software is licensed as described in the file COPYING, which
8
* you should have received as part of this distribution. The terms
9
* are also available at http://subversion.tigris.org/license-1.html.
10
* If newer versions of this license are posted there, you may use a
11
* newer version instead, at your option.
13
* This software consists of voluntary contributions made by many
14
* individuals. For exact contribution history, see the revision
15
* history and logs, available at http://subversion.tigris.org/.
16
* ====================================================================
19
/* Validate a UTF-8 string according to the rules in
21
* Table 3-6. Well-Formed UTF-8 Bytes Sequences
25
* The Unicode Standard, Version 4.0
27
* which is available at
29
* http://www.unicode.org/
31
* UTF-8 was originally defined in RFC-2279, Unicode's "well-formed UTF-8"
32
* is a subset of that enconding. The Unicode enconding prohibits things
33
* like non-shortest encodings (some characters can be represented by more
34
* than one multi-byte encoding) and the encodings for the surrogate code
35
* points. RFC-3629 superceeds RFC-2279 and adopts the same well-formed
36
* rules as Unicode. This is the ABNF in RFC-3629 that describes
37
* well-formed UTF-8 rules:
39
* UTF8-octets = *( UTF8-char )
40
* UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
42
* UTF8-2 = %xC2-DF UTF8-tail
43
* UTF8-3 = %xE0 %xA0-BF UTF8-tail /
44
* %xE1-EC 2( UTF8-tail ) /
45
* %xED %x80-9F UTF8-tail /
46
* %xEE-EF 2( UTF8-tail )
47
* UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
48
* %xF1-F3 3( UTF8-tail ) /
49
* %xF4 %x80-8F 2( UTF8-tail )
56
/* Lookup table to categorise each octet in the string. */
57
static const char octet_category[256] = {
58
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7f */
59
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
65
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8f */
67
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x90-0x9f */
68
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xa0-0xbf */
69
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
71
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* 0xc2-0xdf */
72
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
74
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, /* 0xe1-0xec */
78
11, 11, 11, /* 0xf1-0xf3 */
80
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 /* 0xf5-0xff */
87
#define FSM_80BF80BF 3
90
#define FSM_80BF80BF80BF 6
94
/* In the FSM it appears that categories 0xc0-0xc1 and 0xf5-0xff make the
95
same transitions, as do categories 0xe1-0xec and 0xee-0xef. I wonder if
96
there is any great benefit in combining categories? It would reduce the
97
memory footprint of the transition table by 16 bytes, but might it be
98
harder to understand? */
100
/* Machine transition table */
101
static const char machine [9][14] = {
103
{FSM_START, /* 0x00-0x7f */
104
FSM_ERROR, /* 0x80-0x8f */
105
FSM_ERROR, /* 0x90-0x9f */
106
FSM_ERROR, /* 0xa0-0xbf */
107
FSM_ERROR, /* 0xc0-0xc1 */
108
FSM_80BF, /* 0xc2-0xdf */
110
FSM_80BF80BF, /* 0xe1-0xec */
112
FSM_80BF80BF, /* 0xee-0xef */
114
FSM_80BF80BF80BF, /* 0xf1-0xf3 */
116
FSM_ERROR}, /* 0xf5-0xff */
119
{FSM_ERROR, /* 0x00-0x7f */
120
FSM_START, /* 0x80-0x8f */
121
FSM_START, /* 0x90-0x9f */
122
FSM_START, /* 0xa0-0xbf */
123
FSM_ERROR, /* 0xc0-0xc1 */
124
FSM_ERROR, /* 0xc2-0xdf */
125
FSM_ERROR, /* 0xe0 */
126
FSM_ERROR, /* 0xe1-0xec */
127
FSM_ERROR, /* 0xed */
128
FSM_ERROR, /* 0xee-0xef */
129
FSM_ERROR, /* 0xf0 */
130
FSM_ERROR, /* 0xf1-0xf3 */
131
FSM_ERROR, /* 0xf4 */
132
FSM_ERROR}, /* 0xf5-0xff */
135
{FSM_ERROR, /* 0x00-0x7f */
136
FSM_ERROR, /* 0x80-0x8f */
137
FSM_ERROR, /* 0x90-0x9f */
138
FSM_80BF, /* 0xa0-0xbf */
139
FSM_ERROR, /* 0xc0-0xc1 */
140
FSM_ERROR, /* 0xc2-0xdf */
141
FSM_ERROR, /* 0xe0 */
142
FSM_ERROR, /* 0xe1-0xec */
143
FSM_ERROR, /* 0xed */
144
FSM_ERROR, /* 0xee-0xef */
145
FSM_ERROR, /* 0xf0 */
146
FSM_ERROR, /* 0xf1-0xf3 */
147
FSM_ERROR, /* 0xf4 */
148
FSM_ERROR}, /* 0xf5-0xff */
151
{FSM_ERROR, /* 0x00-0x7f */
152
FSM_80BF, /* 0x80-0x8f */
153
FSM_80BF, /* 0x90-0x9f */
154
FSM_80BF, /* 0xa0-0xbf */
155
FSM_ERROR, /* 0xc0-0xc1 */
156
FSM_ERROR, /* 0xc2-0xdf */
157
FSM_ERROR, /* 0xe0 */
158
FSM_ERROR, /* 0xe1-0xec */
159
FSM_ERROR, /* 0xed */
160
FSM_ERROR, /* 0xee-0xef */
161
FSM_ERROR, /* 0xf0 */
162
FSM_ERROR, /* 0xf1-0xf3 */
163
FSM_ERROR, /* 0xf4 */
164
FSM_ERROR}, /* 0xf5-0xff */
167
{FSM_ERROR, /* 0x00-0x7f */
168
FSM_80BF, /* 0x80-0x8f */
169
FSM_80BF, /* 0x90-0x9f */
170
FSM_ERROR, /* 0xa0-0xbf */
171
FSM_ERROR, /* 0xc0-0xc1 */
172
FSM_ERROR, /* 0xc2-0xdf */
173
FSM_ERROR, /* 0xe0 */
174
FSM_ERROR, /* 0xe1-0xec */
175
FSM_ERROR, /* 0xed */
176
FSM_ERROR, /* 0xee-0xef */
177
FSM_ERROR, /* 0xf0 */
178
FSM_ERROR, /* 0xf1-0xf3 */
179
FSM_ERROR, /* 0xf4 */
180
FSM_ERROR}, /* 0xf5-0xff */
183
{FSM_ERROR, /* 0x00-0x7f */
184
FSM_ERROR, /* 0x80-0x8f */
185
FSM_80BF80BF, /* 0x90-0x9f */
186
FSM_80BF80BF, /* 0xa0-0xbf */
187
FSM_ERROR, /* 0xc0-0xc1 */
188
FSM_ERROR, /* 0xc2-0xdf */
189
FSM_ERROR, /* 0xe0 */
190
FSM_ERROR, /* 0xe1-0xec */
191
FSM_ERROR, /* 0xed */
192
FSM_ERROR, /* 0xee-0xef */
193
FSM_ERROR, /* 0xf0 */
194
FSM_ERROR, /* 0xf1-0xf3 */
195
FSM_ERROR, /* 0xf4 */
196
FSM_ERROR}, /* 0xf5-0xff */
198
/* FSM_80BF80BF80BF */
199
{FSM_ERROR, /* 0x00-0x7f */
200
FSM_80BF80BF, /* 0x80-0x8f */
201
FSM_80BF80BF, /* 0x90-0x9f */
202
FSM_80BF80BF, /* 0xa0-0xbf */
203
FSM_ERROR, /* 0xc0-0xc1 */
204
FSM_ERROR, /* 0xc2-0xdf */
205
FSM_ERROR, /* 0xe0 */
206
FSM_ERROR, /* 0xe1-0xec */
207
FSM_ERROR, /* 0xed */
208
FSM_ERROR, /* 0xee-0xef */
209
FSM_ERROR, /* 0xf0 */
210
FSM_ERROR, /* 0xf1-0xf3 */
211
FSM_ERROR, /* 0xf4 */
212
FSM_ERROR}, /* 0xf5-0xff */
215
{FSM_ERROR, /* 0x00-0x7f */
216
FSM_80BF80BF, /* 0x80-0x8f */
217
FSM_ERROR, /* 0x90-0x9f */
218
FSM_ERROR, /* 0xa0-0xbf */
219
FSM_ERROR, /* 0xc0-0xc1 */
220
FSM_ERROR, /* 0xc2-0xdf */
221
FSM_ERROR, /* 0xe0 */
222
FSM_ERROR, /* 0xe1-0xec */
223
FSM_ERROR, /* 0xed */
224
FSM_ERROR, /* 0xee-0xef */
225
FSM_ERROR, /* 0xf0 */
226
FSM_ERROR, /* 0xf1-0xf3 */
227
FSM_ERROR, /* 0xf4 */
228
FSM_ERROR}, /* 0xf5-0xff */
231
{FSM_ERROR, /* 0x00-0x7f */
232
FSM_ERROR, /* 0x80-0x8f */
233
FSM_ERROR, /* 0x90-0x9f */
234
FSM_ERROR, /* 0xa0-0xbf */
235
FSM_ERROR, /* 0xc0-0xc1 */
236
FSM_ERROR, /* 0xc2-0xdf */
237
FSM_ERROR, /* 0xe0 */
238
FSM_ERROR, /* 0xe1-0xec */
239
FSM_ERROR, /* 0xed */
240
FSM_ERROR, /* 0xee-0xef */
241
FSM_ERROR, /* 0xf0 */
242
FSM_ERROR, /* 0xf1-0xf3 */
243
FSM_ERROR, /* 0xf4 */
244
FSM_ERROR}, /* 0xf5-0xff */
249
svn_utf__last_valid(const char *data, apr_size_t len)
251
const char *start = data, *end = data + len;
252
int state = FSM_START;
255
unsigned char octet = *data++;
256
int category = octet_category[octet];
257
state = machine[state][category];
258
if (state == FSM_START)
265
svn_utf__cstring_is_valid(const char *data)
267
int state = FSM_START;
270
unsigned char octet = *data++;
271
int category = octet_category[octet];
272
state = machine[state][category];
274
return state == FSM_START ? TRUE : FALSE;
278
svn_utf__is_valid(const char *data, apr_size_t len)
280
const char *end = data + len;
281
int state = FSM_START;
284
unsigned char octet = *data++;
285
int category = octet_category[octet];
286
state = machine[state][category];
288
return state == FSM_START ? TRUE : FALSE;
292
svn_utf__last_valid2(const char *data, apr_size_t len)
294
const char *start = data, *end = data + len;
295
int state = FSM_START;
298
unsigned char octet = *data++;
304
else if (octet <= 0xC1)
306
else if (octet <= 0xDF)
308
else if (octet == 0xE0)
310
else if (octet <= 0xEC)
311
state = FSM_80BF80BF;
312
else if (octet == 0xED)
314
else if (octet <= 0xEF)
315
state = FSM_80BF80BF;
316
else if (octet == 0xF0)
318
else if (octet <= 0xF3)
319
state = FSM_80BF80BF80BF;
320
else if (octet <= 0xF4)
326
if (octet >= 0x80 && octet <= 0xBF)
332
if (octet >= 0xA0 && octet <= 0xBF)
338
if (octet >= 0x80 && octet <= 0xBF)
344
if (octet >= 0x80 && octet <= 0x9F)
350
if (octet >= 0x90 && octet <= 0xBF)
351
state = FSM_80BF80BF;
355
case FSM_80BF80BF80BF:
356
if (octet >= 0x80 && octet <= 0xBF)
357
state = FSM_80BF80BF;
362
if (octet >= 0x80 && octet <= 0x8F)
363
state = FSM_80BF80BF;
371
if (state == FSM_START)