/*
 * utf8proc.c:  Wrappers for the utf8proc library
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */
26
#include <apr_fnmatch.h>
28
#include "private/svn_string_private.h"
29
#include "private/svn_utf_private.h"
30
#include "svn_private_config.h"
32
#define UTF8PROC_INLINE
33
/* Somehow utf8proc thinks it is nice to use strlen as an argument name,
34
while this function is already defined via apr.h */
35
#define strlen svn__strlen_var
36
#include "utf8proc/utf8proc.c"
40
const char *svn_utf__utf8proc_version(void)
42
/* Unused static function warning removal hack. */
43
SVN_UNUSED(utf8proc_NFD);
44
SVN_UNUSED(utf8proc_NFC);
45
SVN_UNUSED(utf8proc_NFKD);
46
SVN_UNUSED(utf8proc_NFKC);
48
return utf8proc_version();
53
/* Fill the given BUFFER with decomposed UCS-4 representation of the
54
* UTF-8 STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING
55
* is NUL-terminated; otherwise look only at the first LENGTH bytes in
56
* STRING. Upon return, BUFFER->data points at an array of UCS-4
57
* characters, and return the length of the array. TRANSFORM_FLAGS
58
* define exactly how the decomposition is performed.
60
* A negative return value is an utf8proc error code and may indicate
61
* that STRING contains invalid UTF-8 or was so long that an overflow
65
unicode_decomposition(int transform_flags,
66
const char *string, apr_size_t length,
69
const int nullterm = (length == SVN_UTF__UNKNOWN_LENGTH
70
? UTF8PROC_NULLTERM : 0);
74
apr_int32_t *const ucs4buf = buffer->data;
75
const ssize_t ucs4len = buffer->size / sizeof(*ucs4buf);
76
const ssize_t result =
77
utf8proc_decompose((const void*) string, length, ucs4buf, ucs4len,
78
UTF8PROC_DECOMPOSE | UTF8PROC_STABLE
79
| transform_flags | nullterm);
81
if (result < 0 || result <= ucs4len)
84
/* Increase the decomposition buffer size and retry */
85
svn_membuf__ensure(buffer, result * sizeof(*ucs4buf));
89
/* Fill the given BUFFER with an NFD UCS-4 representation of the UTF-8
90
* STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
91
* NUL-terminated; otherwise look only at the first LENGTH bytes in
92
* STRING. Upon return, BUFFER->data points at an array of UCS-4
93
* characters and *RESULT_LENGTH contains the length of the array.
95
* A returned error may indicate that STRING contains invalid UTF-8 or
96
* invalid Unicode codepoints. Any error message comes from utf8proc.
99
decompose_normalized(apr_size_t *result_length,
100
const char *string, apr_size_t length,
101
svn_membuf_t *buffer)
103
ssize_t result = unicode_decomposition(0, string, length, buffer);
105
return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
106
gettext(utf8proc_errmsg(result)));
107
*result_length = result;
111
/* Fill the given BUFFER with an NFC UTF-8 representation of the UTF-8
112
* STRING. If LENGTH is SVN_UTF__UNKNOWN_LENGTH, assume STRING is
113
* NUL-terminated; otherwise look only at the first LENGTH bytes in
114
* STRING. Upon return, BUFFER->data points at a NUL-terminated string
115
* of UTF-8 characters.
117
* A returned error may indicate that STRING contains invalid UTF-8 or
118
* invalid Unicode codepoints. Any error message comes from utf8proc.
121
normalize_cstring(apr_size_t *result_length,
122
const char *string, apr_size_t length,
123
svn_membuf_t *buffer)
125
ssize_t result = unicode_decomposition(0, string, length, buffer);
128
svn_membuf__resize(buffer, result * sizeof(apr_int32_t) + 1);
129
result = utf8proc_reencode(buffer->data, result,
130
UTF8PROC_COMPOSE | UTF8PROC_STABLE);
133
return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
134
gettext(utf8proc_errmsg(result)));
135
*result_length = result;
139
/* Compare two arrays of UCS-4 codes, BUFA of length LENA and BUFB of
140
* length LENB. Return 0 if they're equal, a negative value if BUFA is
141
* less than BUFB, otherwise a positive value.
143
* Yes, this is strcmp for known-length UCS-4 strings.
146
ucs4cmp(const apr_int32_t *bufa, apr_size_t lena,
147
const apr_int32_t *bufb, apr_size_t lenb)
149
const apr_size_t len = (lena < lenb ? lena : lenb);
152
for (i = 0; i < len; ++i)
154
const int diff = bufa[i] - bufb[i];
158
return (lena == lenb ? 0 : (lena < lenb ? -1 : 1));
162
svn_utf__normcmp(int *result,
163
const char *str1, apr_size_t len1,
164
const char *str2, apr_size_t len2,
165
svn_membuf_t *buf1, svn_membuf_t *buf2)
170
/* Shortcut-circuit the decision if at least one of the strings is empty. */
171
const svn_boolean_t empty1 =
172
(0 == len1 || (len1 == SVN_UTF__UNKNOWN_LENGTH && !*str1));
173
const svn_boolean_t empty2 =
174
(0 == len2 || (len2 == SVN_UTF__UNKNOWN_LENGTH && !*str2));
175
if (empty1 || empty2)
177
*result = (empty1 == empty2 ? 0 : (empty1 ? -1 : 1));
181
SVN_ERR(decompose_normalized(&buflen1, str1, len1, buf1));
182
SVN_ERR(decompose_normalized(&buflen2, str2, len2, buf2));
183
*result = ucs4cmp(buf1->data, buflen1, buf2->data, buflen2);
188
svn_utf__normalize(const char **result,
189
const char *str, apr_size_t len,
192
apr_size_t result_length;
193
SVN_ERR(normalize_cstring(&result_length, str, len, buf));
194
*result = (const char*)(buf->data);
198
/* Decode a single UCS-4 code point to UTF-8, appending the result to BUFFER.
199
* Assume BUFFER is already filled to *LENGTH and return the new size there.
200
* This function does *not* nul-terminate the stringbuf!
202
* A returned error indicates that the codepoint is invalid.
205
encode_ucs4(svn_membuf_t *buffer, apr_int32_t ucs4chr, apr_size_t *length)
209
if (buffer->size - *length < 4)
210
svn_membuf__resize(buffer, buffer->size + 4);
212
utf8len = utf8proc_encode_char(ucs4chr, ((uint8_t*)buffer->data + *length));
214
return svn_error_createf(SVN_ERR_UTF8PROC_ERROR, NULL,
215
_("Invalid Unicode character U+%04lX"),
222
svn_utf__encode_ucs4_string(svn_membuf_t *buffer,
223
const apr_int32_t *ucs4str,
225
apr_size_t *result_length)
229
SVN_ERR(encode_ucs4(buffer, *ucs4str++, result_length));
230
svn_membuf__resize(buffer, *result_length + 1);
231
((char*)buffer->data)[*result_length] = '\0';
237
svn_utf__glob(svn_boolean_t *match,
238
const char *pattern, apr_size_t pattern_len,
239
const char *string, apr_size_t string_len,
240
const char *escape, apr_size_t escape_len,
241
svn_boolean_t sql_like,
242
svn_membuf_t *pattern_buf,
243
svn_membuf_t *string_buf,
244
svn_membuf_t *temp_buf)
246
apr_size_t patternbuf_len;
247
apr_size_t tempbuf_len;
249
/* If we're in GLOB mode, we don't do custom escape chars. */
250
if (escape && !sql_like)
251
return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
252
_("Cannot use a custom escape token"
253
" in glob matching mode"));
255
/* Convert the patern to NFD UTF-8. We can't use the UCS-4 result
256
because apr_fnmatch can't handle it.*/
257
SVN_ERR(decompose_normalized(&tempbuf_len, pattern, pattern_len, temp_buf));
259
SVN_ERR(svn_utf__encode_ucs4_string(pattern_buf, temp_buf->data,
260
tempbuf_len, &patternbuf_len));
263
/* Convert a LIKE pattern to a GLOB pattern that apr_fnmatch can use. */
264
const apr_int32_t *like = temp_buf->data;
266
svn_boolean_t escaped;
270
ucs4esc = -1; /* Definitely an invalid UCS-4 character. */
273
const int nullterm = (escape_len == SVN_UTF__UNKNOWN_LENGTH
274
? UTF8PROC_NULLTERM : 0);
276
utf8proc_decompose((const void*) escape, escape_len, &ucs4esc, 1,
277
UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | nullterm);
279
return svn_error_create(SVN_ERR_UTF8PROC_ERROR, NULL,
280
gettext(utf8proc_errmsg(result)));
281
if (result == 0 || result > 1)
282
return svn_error_create(SVN_ERR_UTF8_GLOB, NULL,
283
_("Escape token must be one character"));
284
if ((ucs4esc & 0xFF) != ucs4esc)
285
return svn_error_createf(SVN_ERR_UTF8_GLOB, NULL,
286
_("Invalid escape character U+%04lX"),
291
svn_membuf__ensure(pattern_buf, tempbuf_len + 1);
292
for (i = 0, escaped = FALSE; i < tempbuf_len; ++i, ++like)
294
if (*like == ucs4esc && !escaped)
296
svn_membuf__resize(pattern_buf, patternbuf_len + 1);
297
((char*)pattern_buf->data)[patternbuf_len++] = '\\';
302
SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
307
if ((*like == '[' || *like == '\\') && !escaped)
309
/* Escape brackets and backslashes which are always
310
literals in LIKE patterns. */
311
svn_membuf__resize(pattern_buf, patternbuf_len + 1);
312
((char*)pattern_buf->data)[patternbuf_len++] = '\\';
318
/* Replace LIKE wildcards with their GLOB equivalents. */
319
if (*like == '%' || *like == '_')
321
const char wildcard = (*like == '%' ? '*' : '?');
322
svn_membuf__resize(pattern_buf, patternbuf_len + 1);
323
((char*)pattern_buf->data)[patternbuf_len++] = wildcard;
326
SVN_ERR(encode_ucs4(pattern_buf, *like, &patternbuf_len));
329
svn_membuf__resize(pattern_buf, patternbuf_len + 1);
330
((char*)pattern_buf->data)[patternbuf_len] = '\0';
333
/* Now normalize the string */
334
SVN_ERR(decompose_normalized(&tempbuf_len, string, string_len, temp_buf));
335
SVN_ERR(svn_utf__encode_ucs4_string(string_buf, temp_buf->data,
336
tempbuf_len, &tempbuf_len));
338
*match = !apr_fnmatch(pattern_buf->data, string_buf->data, 0);
343
svn_utf__is_normalized(const char *string, apr_pool_t *scratch_pool)
347
apr_size_t result_length;
348
const apr_size_t length = strlen(string);
349
svn_membuf__create(&buffer, length * sizeof(apr_int32_t), scratch_pool);
350
err = normalize_cstring(&result_length, string, length, &buffer);
353
svn_error_clear(err);
356
return (length == result_length && 0 == strcmp(string, buffer.data));
360
svn_utf__fuzzy_escape(const char *src, apr_size_t length, apr_pool_t *pool)
362
/* Hexadecimal digits for code conversion. */
363
static const char digits[] = "0123456789ABCDEF";
365
/* Flags used for Unicode decomposition. */
366
static const int decomp_flags = (
367
UTF8PROC_COMPAT | UTF8PROC_STABLE | UTF8PROC_LUMP
368
| UTF8PROC_NLF2LF | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK);
370
svn_stringbuf_t *result;
372
ssize_t decomp_length;
375
/* Decompose to a non-reversible compatibility format. */
376
svn_membuf__create(&buffer, length * sizeof(apr_int32_t), pool);
377
decomp_length = unicode_decomposition(decomp_flags, src, length, &buffer);
378
if (decomp_length < 0)
381
apr_size_t done, prev;
383
/* The only other error we can receive here indicates an integer
384
overflow due to the length of the input string. Not very
385
likely, but we certainly shouldn't continue in that case. */
386
SVN_ERR_ASSERT_NO_RETURN(decomp_length == UTF8PROC_ERROR_INVALIDUTF8);
388
/* Break the decomposition into parts that are valid UTF-8, and
389
bytes that are not. Represent the invalid bytes in the target
390
erray by their negative value. This works because utf8proc
391
will not generate Unicode code points with values larger than
393
svn_membuf__create(&part, sizeof(apr_int32_t), pool);
396
while (done < length)
400
while (done < length)
402
len = utf8proc_iterate((uint8_t*)src + done, length - done, &uc);
408
/* Decompose the valid part */
411
len = unicode_decomposition(
412
decomp_flags, src + prev, done - prev, &part);
413
SVN_ERR_ASSERT_NO_RETURN(len > 0);
415
&buffer, (decomp_length + len) * sizeof(apr_int32_t));
416
memcpy((apr_int32_t*)buffer.data + decomp_length,
417
part.data, len * sizeof(apr_int32_t));
418
decomp_length += len;
422
/* What follows could be a valid UTF-8 sequence, but not
423
a valid Unicode character. */
428
/* Determine the length of the UTF-8 sequence */
429
const char *const p = src + done;
430
len = utf8proc_utf8class[(uint8_t)*p];
432
/* Check if the multi-byte sequence is valid UTF-8. */
433
if (len > 1 && len <= (apr_ssize_t)(length - done))
434
last = svn_utf__last_valid(p, len);
438
/* Might not be a valid UTF-8 sequence at all */
439
if (!last || (last && last - p < len))
441
uc = -((apr_int32_t)(*p & 0xff));
448
/* Decode the UTF-8 sequence without validation. */
450
uc = ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
453
uc = (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
457
uc = (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
458
+ ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
461
SVN_ERR_ASSERT_NO_RETURN(
462
!"Unexpected invalid UTF-8 byte");
468
&buffer, (decomp_length + 1) * sizeof(apr_int32_t));
469
((apr_int32_t*)buffer.data)[decomp_length++] = uc;
476
/* Scan the result and deleting any combining diacriticals and
477
inserting placeholders where any non-ascii characters remain. */
478
result = svn_stringbuf_create_ensure(decomp_length, pool);
479
for (len = 0; len < decomp_length; ++len)
481
const apr_int32_t cp = ((apr_int32_t*)buffer.data)[len];
482
if (cp > 0 && cp < 127)
483
svn_stringbuf_appendbyte(result, (char)cp);
485
svn_stringbuf_appendcstr(result, "\\0");
488
const apr_int32_t rcp = ((-cp) & 0xff);
489
svn_stringbuf_appendcstr(result, "?\\");
490
svn_stringbuf_appendbyte(result, digits[(rcp & 0x00f0) >> 4]);
491
svn_stringbuf_appendbyte(result, digits[(rcp & 0x000f)]);
495
if (utf8proc_codepoint_valid(cp))
497
const utf8proc_property_t *prop = utf8proc_get_property(cp);
498
if (prop->combining_class != 0)
499
continue; /* Combining mark; ignore */
500
svn_stringbuf_appendcstr(result, "{U+");
503
svn_stringbuf_appendcstr(result, "{U?");
506
svn_stringbuf_appendbyte(result, digits[(cp & 0xf00000) >> 20]);
507
svn_stringbuf_appendbyte(result, digits[(cp & 0x0f0000) >> 16]);
509
svn_stringbuf_appendbyte(result, digits[(cp & 0xf000) >> 12]);
510
svn_stringbuf_appendbyte(result, digits[(cp & 0x0f00) >> 8]);
511
svn_stringbuf_appendbyte(result, digits[(cp & 0x00f0) >> 4]);
512
svn_stringbuf_appendbyte(result, digits[(cp & 0x000f)]);
513
svn_stringbuf_appendbyte(result, '}');