1
/* $OpenLDAP: pkg/ldap/libraries/liblunicode/utbm/utbm.c,v 1.7.2.4 2009/01/22 00:00:57 kurt Exp $ */
2
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
4
* Copyright 1998-2009 The OpenLDAP Foundation.
7
* Redistribution and use in source and binary forms, with or without
8
* modification, are permitted only as authorized by the OpenLDAP
11
* A copy of this license is available in file LICENSE in the
12
* top-level directory of the distribution or, alternatively, at
13
* <http://www.OpenLDAP.org/license.html>.
15
/* Copyright 1997, 1998, 1999 Computing Research Labs,
16
* New Mexico State University
18
* Permission is hereby granted, free of charge, to any person obtaining a
19
* copy of this software and associated documentation files (the "Software"),
20
* to deal in the Software without restriction, including without limitation
21
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
22
* and/or sell copies of the Software, and to permit persons to whom the
23
* Software is furnished to do so, subject to the following conditions:
25
* The above copyright notice and this permission notice shall be included in
26
* all copies or substantial portions of the Software.
28
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
31
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
32
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
33
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
34
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36
/* $Id: utbm.c,v 1.1 1999/09/21 15:45:17 mleisher Exp $ */
40
* 1. Case conversions of UTF-16 characters must also be UTF-16 characters.
41
* 2. Case conversions are all one-to-one.
42
* 3. Text and pattern have already been normalized in some fashion.
51
* Single pattern character.
64
typedef struct _utbm_pattern_t {
68
unsigned long pat_used;
69
unsigned long pat_size;
73
unsigned long skip_used;
74
unsigned long skip_size;
79
/*************************************************************************
83
*************************************************************************/
86
* Routine to look up the skip value for a character.
89
_utbm_skip(utbm_pattern_t p, ucs2_t *start, ucs2_t *end)
99
c2 = (start + 1 < end) ? *(start + 1) : ~0;
100
if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
101
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
103
for (i = 0, sp = p->skip; i < p->skip_used; i++, sp++) {
104
if (!((c1 ^ sp->ch->uc) & (c1 ^ sp->ch->lc) & (c1 ^ sp->ch->tc))) {
105
return ((unsigned long) (end - start) < sp->skip) ?
106
end - start : sp->skip;
113
_utbm_match(utbm_pattern_t pat, ucs2_t *text, ucs2_t *start, ucs2_t *end,
114
unsigned long *match_start, unsigned long *match_end)
122
* Set the potential match endpoint first.
124
*match_end = (start - text) + 1;
127
c2 = (start + 1 < end) ? *(start + 1) : ~0;
128
if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff) {
129
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
131
* Adjust the match end point to occur after the UTF-16 character.
133
*match_end = *match_end + 1;
136
if (pat->pat_used == 1) {
137
*match_start = start - text;
144
cp = pat->pat + (pat->pat_used - 1);
146
for (count = pat->patlen; start > text && count > 0;) {
148
* Ignore non-spacing characters if indicated.
150
if (pat->flags & UTBM_IGNORE_NONSPACING) {
151
while (start > text && _utbm_nonspacing(c1)) {
153
c1 = (start - 1 > text) ? *(start - 1) : ~0;
154
if (0xdc00 <= c2 && c2 <= 0xdfff &&
155
0xd800 <= c1 && c1 <= 0xdbff) {
156
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
164
* Handle space compression if indicated.
166
if (pat->flags & UTBM_SPACE_COMPRESS) {
168
while (start > text &&
169
(_utbm_isspace(c1, 1) || _utbm_iscntrl(c1))) {
170
check_space = _utbm_isspace(c1, 1);
172
c1 = (start - 1 > text) ? *(start - 1) : ~0;
173
if (0xdc00 <= c2 && c2 <= 0xdfff &&
174
0xd800 <= c1 && c1 <= 0xdbff) {
175
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
181
* Handle things if space compression was indicated and one or
182
* more member characters were found.
193
* Handle the normal comparison cases.
195
if (count > 0 && ((c1 ^ cp->uc) & (c1 ^ cp->lc) & (c1 ^ cp->tc)))
198
count -= (c1 >= 0x10000) ? 2 : 1;
203
* Get the next preceding character.
207
c1 = (start - 1 > text) ? *(start - 1) : ~0;
208
if (0xdc00 <= c2 && c2 <= 0xdfff &&
209
0xd800 <= c1 && c1 <= 0xdbff) {
210
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
219
* Set the match start position.
221
*match_start = start - text;
225
/*************************************************************************
229
*************************************************************************/
232
utbm_create_pattern(void)
236
p = (utbm_pattern_t) malloc(sizeof(_utbm_pattern_t));
237
(void) memset((char *) p, '\0', sizeof(_utbm_pattern_t));
242
utbm_free_pattern(utbm_pattern_t pattern)
247
if (pattern->pat_size > 0)
248
free((char *) pattern->pat);
250
if (pattern->skip_size > 0)
251
free((char *) pattern->skip);
253
free((char *) pattern);
257
utbm_compile(ucs2_t *pat, unsigned long patlen, unsigned long flags,
261
unsigned long i, j, k, slen;
264
ucs4_t c1, c2, sentinel;
266
if (p == 0 || pat == 0 || *pat == 0 || patlen == 0)
270
* Reset the pattern buffer.
272
p->patlen = p->pat_used = p->skip_used = 0;
280
* Initialize the extra skip flag.
285
* Allocate more storage if necessary.
287
if (patlen > p->pat_size) {
288
if (p->pat_size == 0) {
289
p->pat = (_utbm_char_t *) malloc(sizeof(_utbm_char_t) * patlen);
290
p->skip = (_utbm_skip_t *) malloc(sizeof(_utbm_skip_t) * patlen);
292
p->pat = (_utbm_char_t *)
293
realloc((char *) p->pat, sizeof(_utbm_char_t) * patlen);
294
p->skip = (_utbm_skip_t *)
295
realloc((char *) p->skip, sizeof(_utbm_skip_t) * patlen);
297
p->pat_size = p->skip_size = patlen;
301
* Preprocess the pattern to remove controls (if specified) and determine
304
for (have_space = 0, cp = p->pat, i = 0; i < patlen; i++) {
306
c2 = (i + 1 < patlen) ? pat[i + 1] : ~0;
307
if (0xd800 <= c1 && c1 <= 0xdbff && 0xdc00 <= c2 && c2 <= 0xdfff)
308
c1 = 0x10000 + (((c1 & 0x03ff) << 10) | (c2 & 0x03ff));
311
* Make sure the `have_space' flag is turned off if the character
312
* is not an appropriate one.
314
if (!_utbm_isspace(c1, flags & UTBM_SPACE_COMPRESS))
318
* If non-spacing characters should be ignored, do it here.
320
if ((flags & UTBM_IGNORE_NONSPACING) && _utbm_nonspacing(c1))
324
* Check if spaces and controls need to be compressed.
326
if (flags & UTBM_SPACE_COMPRESS) {
327
if (_utbm_isspace(c1, 1)) {
330
* Add a space and set the flag.
332
cp->uc = cp->lc = cp->tc = ' ';
336
* Increase the real pattern length.
346
* Ignore all control characters.
348
if (_utbm_iscntrl(c1))
355
if (flags & UTBM_CASEFOLD) {
356
cp->uc = _utbm_toupper(c1);
357
cp->lc = _utbm_tolower(c1);
358
cp->tc = _utbm_totitle(c1);
360
cp->uc = cp->lc = cp->tc = c1;
363
* Set the sentinel character.
368
* Move to the next character.
373
* Increase the real pattern length appropriately.
375
p->patlen += (c1 >= 0x10000) ? 2 : 1;
378
* Increment the loop index for UTF-16 characters.
380
i += (c1 >= 0x10000) ? 1 : 0;
385
* Set the number of characters actually used.
387
p->pat_used = cp - p->pat;
390
* Go through and construct the skip array and determine the actual length
391
* of the pattern in UCS2 terms.
393
slen = p->patlen - 1;
395
for (i = k = 0; i < p->pat_used; i++, cp++) {
397
* Locate the character in the skip array.
399
for (sp = p->skip, j = 0;
400
j < p->skip_used && sp->ch->uc != cp->uc; j++, sp++) ;
403
* If the character is not found, set the new skip element and
404
* increase the number of skip elements.
406
if (j == p->skip_used) {
412
* Set the updated skip value. If the character is UTF-16 and is
413
* not the last one in the pattern, add one to its skip value.
416
if (cp->uc >= 0x10000 && k + 2 < slen)
420
* Set the new extra skip for the sentinel character.
422
if (((cp->uc >= 0x10000 && k + 2 <= slen) || k + 1 <= slen) &&
427
* Increase the actual index.
429
k += (cp->uc >= 0x10000) ? 2 : 1;
434
utbm_exec(utbm_pattern_t pat, ucs2_t *text, unsigned long textlen,
435
unsigned long *match_start, unsigned long *match_end)
440
if (pat == 0 || pat->pat_used == 0 || text == 0 || textlen == 0 ||
441
textlen < pat->patlen)
444
start = text + pat->patlen;
445
end = text + textlen;
448
* Adjust the start point if it points to a low surrogate.
450
if (0xdc00 <= *start && *start <= 0xdfff &&
451
0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
454
while (start < end) {
455
while ((k = _utbm_skip(pat, start, end))) {
457
if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
458
0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)
463
_utbm_match(pat, text, start, end, match_start, match_end))
467
if (start < end && 0xdc00 <= *start && *start <= 0xdfff &&
468
0xd800 <= *(start - 1) && *(start - 1) <= 0xdbff)