1
/**********************************************************************
2
regenc.c - Oniguruma (regular expression library)
3
**********************************************************************/
5
* Copyright (c) 2002-2005 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8
* Redistribution and use in source and binary forms, with or without
9
* modification, are permitted provided that the following conditions
11
* 1. Redistributions of source code must retain the above copyright
12
* notice, this list of conditions and the following disclaimer.
13
* 2. Redistributions in binary form must reproduce the above copyright
14
* notice, this list of conditions and the following disclaimer in the
15
* documentation and/or other materials provided with the distribution.
17
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32
/* Default character encoding. Starts as ONIG_ENCODING_INIT_DEFAULT, is
 * returned by onigenc_get_default_encoding() and overwritten by
 * onigenc_set_default_encoding(). */
OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT;
41
/* Return the default encoding currently stored in
 * OnigEncDefaultCharEncoding. */
onigenc_get_default_encoding()
43
return OnigEncDefaultCharEncoding;
47
/* Make `enc` the default encoding by storing it in
 * OnigEncDefaultCharEncoding. */
onigenc_set_default_encoding(OnigEncoding enc)
49
OnigEncDefaultCharEncoding = enc;
54
/* Adjust position `s` to a character head.  p is first obtained with
 * ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s).
 * NOTE(review): judging by the name, p is then moved forward when it lies
 * before s -- confirm against the remainder of the body. */
onigenc_get_right_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
56
UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
64
/* Variant of the right-adjust lookup that can also report a "previous"
 * position through `prev`.
 *
 * p starts as ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s).
 * `prev` is optional: it is written only when it is non-NULL.  One path
 * stores p in *prev, another stores NULL (the original author's "Sorry"
 * marks that path).
 * NOTE(review): the conditions selecting between the two paths are not
 * visible here -- confirm which case yields NULL. */
onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc,
65
const UChar* start, const UChar* s, const UChar** prev)
67
UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
70
if (prev) *prev = (const UChar* )p;
74
if (prev) *prev = (const UChar* )NULL; /* Sorry */
80
/* Return ONIGENC_LEFT_ADJUST_CHAR_HEAD applied to the byte just before s
 * (s - 1), bounded by `start`. */
onigenc_get_prev_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
85
return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
89
/* Move s backwards up to n times.  Each iteration replaces s with
 * ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1); the loop stops early as
 * soon as ONIG_IS_NOT_NULL(s) fails.  n is consumed (post-decremented) by the
 * loop condition. */
onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
91
while (ONIG_IS_NOT_NULL(s) && n-- > 0) {
95
s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1);
101
/* Advance from p by whole encoded characters: q starts at p and grows by
 * ONIGENC_MBC_ENC_LEN(enc, q) per step.
 * Returns q when it has not passed `end` (q <= end), otherwise NULL.
 * NOTE(review): n is presumably the number of characters to advance -- the
 * loop header is not visible here; confirm. */
onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
103
UChar* q = (UChar* )p;
105
q += ONIGENC_MBC_ENC_LEN(enc, q);
107
return (q <= end ? q : NULL);
111
/* Walk from p one encoded character at a time: q starts at p and is advanced
 * by ONIGENC_MBC_ENC_LEN(enc, q).
 * NOTE(review): presumably counts the characters between p and end and
 * returns that count -- confirm against the loop and return statement. */
onigenc_strlen(OnigEncoding enc, const UChar* p, const UChar* end)
114
UChar* q = (UChar* )p;
117
q += ONIGENC_MBC_ENC_LEN(enc, q);
124
/* Scan the string at s without an explicit end pointer and return n.
 *
 * p walks forward from s by ONIGENC_MBC_ENC_LEN(enc, p).  `len` is the
 * encoding's minimum character length (ONIGENC_MBC_MINLEN); when it is 1 the
 * function returns n directly.  Otherwise bytes are examined through q and
 * the examination breaks at the first byte that is not '\0'.
 * NOTE(review): presumably n is the running character count and the q loop
 * verifies that a terminator is `len` zero bytes wide -- confirm. */
onigenc_strlen_null(OnigEncoding enc, const UChar* s)
127
UChar* p = (UChar* )s;
132
int len = ONIGENC_MBC_MINLEN(enc);
134
if (len == 1) return n;
137
if (*q != '\0') break;
141
if (len == 1) return n;
143
p += ONIGENC_MBC_ENC_LEN(enc, p);
149
/* Scan the string at s without an explicit end pointer and return the byte
 * distance (p - start) covered, as an int.
 *
 * p walks forward from s by ONIGENC_MBC_ENC_LEN(enc, p).  `len` is the
 * encoding's minimum character length (ONIGENC_MBC_MINLEN); when it is 1 the
 * function returns the distance directly.  Otherwise bytes are examined
 * through q and the examination breaks at the first byte that is not '\0'.
 * NOTE(review): presumably the q loop verifies that a terminator is `len`
 * zero bytes wide -- confirm. */
onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)
151
UChar* start = (UChar* )s;
152
UChar* p = (UChar* )s;
157
int len = ONIGENC_MBC_MINLEN(enc);
159
if (len == 1) return (int )(p - start);
162
if (*q != '\0') break;
166
if (len == 1) return (int )(p - start);
168
p += ONIGENC_MBC_ENC_LEN(enc, p);
172
#ifndef ONIG_RUBY_M17N
176
#define USE_APPLICATION_TO_LOWER_CASE_TABLE
178
/* 256-entry table of 16-bit ctype flag words; unlike OnigEncAsciiCtypeTable,
 * the upper half (0x80-0xff) also carries non-zero flags.
 * NOTE(review): presumably indexed by ISO-8859-1 / Unicode code value
 * 0x00-0xff, with bits matching the ONIGENC_CTYPE_* masks -- confirm. */
unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = {
179
0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
180
0x2008, 0x228c, 0x2289, 0x2288, 0x2288, 0x2288, 0x2008, 0x2008,
181
0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
182
0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
183
0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
184
0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
185
0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0,
186
0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
187
0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2,
188
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
189
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
190
0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0,
191
0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2,
192
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
193
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
194
0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008,
195
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0288, 0x0008, 0x0008,
196
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
197
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
198
0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008,
199
0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0,
200
0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x00a8, 0x00a0, 0x00a0,
201
0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0,
202
0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0,
203
0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2,
204
0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2,
205
0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0,
206
0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2,
207
0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2,
208
0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2,
209
0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0,
210
0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2
214
/* Currently installed ASCII to-lower table.  Null until
 * onigenc_set_default_caseconv_table() stores a table here. */
const UChar* OnigEncAsciiToLowerCaseTable = (const UChar* )0;
216
#ifndef USE_APPLICATION_TO_LOWER_CASE_TABLE
217
/* Built-in byte -> lower-case map: entries 0101-0132 ('A'-'Z') hold
 * 0141-0172 ('a'-'z'); every other entry holds its own index.  Only compiled
 * when USE_APPLICATION_TO_LOWER_CASE_TABLE is not defined, in which case
 * onigenc_set_default_caseconv_table() falls back to it for a null table. */
static const UChar BuiltInAsciiToLowerCaseTable[] = {
218
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
219
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
220
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
221
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
222
'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
223
'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
224
'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
225
'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
226
'\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
227
'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
228
'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
229
'\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
230
'\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
231
'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
232
'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
233
'\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
234
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
235
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
236
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
237
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
238
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
239
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
240
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
241
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
242
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
243
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
244
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
245
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
246
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
247
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
248
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
249
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
251
#endif /* not USE_APPLICATION_TO_LOWER_CASE_TABLE */
253
#ifdef USE_UPPER_CASE_TABLE
254
/* Byte -> upper-case map (only built with USE_UPPER_CASE_TABLE): entries
 * 0141-0172 ('a'-'z') hold 0101-0132 ('A'-'Z'); every other entry holds its
 * own index. */
UChar OnigEncAsciiToUpperCaseTable[256] = {
255
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
256
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
257
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
258
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
259
'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
260
'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
261
'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
262
'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
263
'\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
264
'\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
265
'\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
266
'\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
267
'\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
268
'\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
269
'\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
270
'\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
271
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
272
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
273
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
274
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
275
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
276
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
277
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
278
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
279
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
280
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
281
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
282
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
283
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
284
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
285
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
286
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
290
/* 256-entry table of 16-bit ctype flag words: entries 0x00-0x7f carry flags,
 * entries 0x80-0xff are all 0x0000.
 * NOTE(review): presumably indexed by byte value, with bits matching the
 * ONIGENC_CTYPE_* masks -- confirm. */
unsigned short OnigEncAsciiCtypeTable[256] = {
291
0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
292
0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008,
293
0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
294
0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008,
295
0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
296
0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
297
0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0,
298
0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0,
299
0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2,
300
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
301
0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2,
302
0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0,
303
0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2,
304
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
305
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2,
306
0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008,
308
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
309
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
310
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
311
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
312
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
313
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
314
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
315
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
316
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
317
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
318
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
319
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
320
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
321
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
322
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
323
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
326
/* Byte -> lower-case map for ISO-8859-1: 0101-0132 ('A'-'Z') map to
 * 0141-0172, and 0300-0336 map to 0340-0376.  Entry 0327 maps to itself
 * inside that range, as does 0337; all remaining entries are identity. */
UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = {
327
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
328
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
329
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
330
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
331
'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
332
'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
333
'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
334
'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
335
'\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
336
'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
337
'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
338
'\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
339
'\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
340
'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
341
'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
342
'\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
343
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
344
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
345
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
346
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
347
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
348
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
349
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
350
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
351
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
352
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
353
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327',
354
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337',
355
'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
356
'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
357
'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
358
'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377'
361
#ifdef USE_UPPER_CASE_TABLE
362
/* Byte -> upper-case map for ISO-8859-1 (only built with
 * USE_UPPER_CASE_TABLE): 0141-0172 ('a'-'z') map to 0101-0132, and 0340-0376
 * map to 0300-0336.  Entry 0367 maps to itself inside that range, as does
 * 0377; all remaining entries are identity. */
UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = {
363
'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
364
'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
365
'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
366
'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
367
'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
368
'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
369
'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
370
'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
371
'\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
372
'\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
373
'\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
374
'\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137',
375
'\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107',
376
'\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117',
377
'\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127',
378
'\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177',
379
'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
380
'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
381
'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
382
'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
383
'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
384
'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
385
'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
386
'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
387
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
388
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
389
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
390
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
391
'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
392
'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
393
'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367',
394
'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377',
399
/* Install `table` as the ASCII to-lower conversion table.
 *
 * A null `table` selects BuiltInAsciiToLowerCaseTable when the build does
 * not define USE_APPLICATION_TO_LOWER_CASE_TABLE.  The global pointer
 * OnigEncAsciiToLowerCaseTable is only rewritten when `table` differs from
 * the value it already holds. */
onigenc_set_default_caseconv_table(const UChar* table)
401
if (table == (const UChar* )0) {
402
#ifndef USE_APPLICATION_TO_LOWER_CASE_TABLE
403
table = BuiltInAsciiToLowerCaseTable;
409
if (table != OnigEncAsciiToLowerCaseTable) {
410
OnigEncAsciiToLowerCaseTable = table;
415
/* Function form of the ONIGENC_LEFT_ADJUST_CHAR_HEAD macro: forwards enc,
 * start and s unchanged and returns the macro's result. */
onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s)
417
return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s);
420
/* Pair-ambiguity table handed out by the *_get_all_pair_ambig_codes()
 * functions when the flag is ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE. */
OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = {
477
/* When `flag` equals ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, point *ccs at
 * OnigAsciiPairAmbigCodes and return the number of elements in that table
 * (sizeof table / sizeof element). */
onigenc_ascii_get_all_pair_ambig_codes(OnigAmbigType flag,
478
OnigPairAmbigCodes** ccs)
480
if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) {
481
*ccs = OnigAsciiPairAmbigCodes;
482
return (sizeof(OnigAsciiPairAmbigCodes) / sizeof(OnigPairAmbigCodes));
490
onigenc_nothing_get_all_comp_ambig_codes(OnigAmbigType flag,
491
OnigCompAmbigCodes** ccs)
497
/* Pair-ambiguity lookup for ISO-8859-1.
 *
 * flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE: *ccs is set to the shared
 *   OnigAsciiPairAmbigCodes table and its element count is returned.
 * flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE: the element count of the
 *   function-local static table cc[] is returned.
 * NOTE(review): the NONASCII branch presumably also points *ccs at cc --
 * confirm. */
onigenc_iso_8859_1_get_all_pair_ambig_codes(OnigAmbigType flag,
498
OnigPairAmbigCodes** ccs)
500
static OnigPairAmbigCodes cc[] = {
566
if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) {
567
*ccs = OnigAsciiPairAmbigCodes;
568
return (sizeof(OnigAsciiPairAmbigCodes) / sizeof(OnigPairAmbigCodes));
570
else if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) {
572
return sizeof(cc) / sizeof(OnigPairAmbigCodes);
579
/* Compound-ambiguity lookup with a single static entry for code 0xdf, whose
 * two alternatives are the sequences 0x53 0x53 and 0x73 0x73.
 * For flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE the element count of
 * folds[] is returned.
 * NOTE(review): 0xdf is presumably the sharp s, folding to "SS"/"ss", and
 * the initializer fields presumably read {count, code, {{len, codes}, ...}}
 * -- confirm against the OnigCompAmbigCodes definition. */
onigenc_ess_tsett_get_all_comp_ambig_codes(OnigAmbigType flag,
580
OnigCompAmbigCodes** ccs)
582
static OnigCompAmbigCodes folds[] = {
583
{ 2, 0xdf, {{ 2, { 0x53, 0x53 } }, { 2, { 0x73, 0x73} } } }
586
if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) {
588
return sizeof(folds) / sizeof(OnigCompAmbigCodes);
595
/* Stand-in for encodings without ctype code-range data: returns
 * ONIG_NO_SUPPORT_CONFIG. */
onigenc_not_support_get_ctype_code_range(int ctype,
596
OnigCodePoint* sbr[], OnigCodePoint* mbr[])
598
return ONIG_NO_SUPPORT_CONFIG;
602
/* Newline test for encodings whose newline is the single byte 0x0a:
 * returns 1 when the byte at p is 0x0a. */
onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end)
605
if (*p == 0x0a) return 1;
610
/* for single byte encodings */
612
/* Normalize one single-byte character for ambiguity matching.  When `flag`
 * has ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE set, *lower receives
 * ONIGENC_ASCII_CODE_TO_LOWER_CASE of the byte at **p.  The return value is
 * always 1, the byte length of the converted character. */
onigenc_ascii_mbc_to_normalize(OnigAmbigType flag, const UChar** p, const UChar*end,
615
if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
616
*lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p);
623
return 1; /* return byte length of converted char to lower */
627
/* Case-ambiguity test for a single-byte character.  p is a snapshot of *pp;
 * when `flag` has ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE set, the result is
 * ONIGENC_IS_ASCII_CODE_CASE_AMBIG of the byte at p. */
onigenc_ascii_is_mbc_ambiguous(OnigAmbigType flag,
628
const UChar** pp, const UChar* end)
630
const UChar* p = *pp;
633
if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
634
return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
642
onigenc_single_byte_mbc_enc_len(const UChar* p)
648
/* Single-byte decoding: the code point is the byte at p, widened to
 * OnigCodePoint. */
onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end)
650
return (OnigCodePoint )(*p);
654
onigenc_single_byte_code_to_mbclen(OnigCodePoint code)
660
/* First (and only) byte of a single-byte character: the low 8 bits of
 * `code`. */
onigenc_single_byte_code_to_mbc_first(OnigCodePoint code)
662
return (code & 0xff);
666
/* Encode `code` as one byte: the low 8 bits are stored at *buf. */
onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf)
668
*buf = (UChar )(code & 0xff);
673
onigenc_single_byte_left_adjust_char_head(const UChar* start, const UChar* s)
679
onigenc_always_true_is_allowed_reverse_match(const UChar* s, const UChar* end)
685
onigenc_always_false_is_allowed_reverse_match(const UChar* s, const UChar* end)
691
/* Decode the multibyte character at p into a code point.
 *
 * len is the character's byte length from enc_len(enc, p); n starts as the
 * first byte (p is advanced past it).  A one-byte character returns n
 * immediately; otherwise a loop runs over the remaining len - 1 bytes.
 * NOTE(review): the loop presumably shifts n left by 8 and appends each
 * trailing byte, stopping at `end` -- confirm. */
onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
696
len = enc_len(enc, p);
697
n = (OnigCodePoint )(*p++);
698
if (len == 1) return n;
700
for (i = 1; i < len; i++) {
709
/* Normalize the character at *pp into `lower` for ambiguity matching.
 *
 * ASCII character (ONIGENC_IS_MBC_ASCII): when `flag` has
 *   ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE set, *lower receives the lower-cased
 *   byte.
 * Otherwise: len = enc_len(enc, p), a loop runs over those len bytes, and
 *   len is returned as the byte length of the converted character.
 * NOTE(review): the loop presumably copies the bytes unchanged into lower
 * and *pp is presumably advanced past the character -- confirm. */
onigenc_mbn_mbc_to_normalize(OnigEncoding enc, OnigAmbigType flag,
710
const UChar** pp, const UChar* end, UChar* lower)
713
const UChar *p = *pp;
715
if (ONIGENC_IS_MBC_ASCII(p)) {
716
if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
717
*lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
726
len = enc_len(enc, p);
729
for (i = 0; i < len; i++) {
734
return len; /* return byte length of converted to lower char */
739
/* Case-ambiguity test for the character at *pp in a multibyte encoding.
 *
 * ASCII character: when `flag` has ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE set,
 *   the result is ONIGENC_IS_ASCII_CODE_CASE_AMBIG of the byte at p.
 * The cursor *pp is advanced by enc_len(enc, p).
 * NOTE(review): which paths advance *pp before returning, and the result
 * for non-ASCII characters, are not visible here -- confirm. */
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigAmbigType flag,
740
const UChar** pp, const UChar* end)
742
const UChar* p = *pp;
744
if (ONIGENC_IS_MBC_ASCII(p)) {
746
if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
747
return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p);
754
(*pp) += enc_len(enc, p);
759
/* Byte length of `code` in an encoding of at most two bytes: 2 when any of
 * bits 8-15 is set. */
onigenc_mb2_code_to_mbclen(OnigCodePoint code)
761
if ((code & 0xff00) != 0) return 2;
766
/* Byte length of `code` in an encoding of at most four bytes, chosen by the
 * highest non-zero byte: 4 for bits 24-31, 3 for bits 16-23, 2 for bits
 * 8-15. */
onigenc_mb4_code_to_mbclen(OnigCodePoint code)
768
if ((code & 0xff000000) != 0) return 4;
769
else if ((code & 0xff0000) != 0) return 3;
770
else if ((code & 0xff00) != 0) return 2;
775
/* Leading byte of `code` in a two-byte encoding: when bits 8-15 are
 * non-zero, `first` is that high byte ((code >> 8) & 0xff). */
onigenc_mb2_code_to_mbc_first(OnigCodePoint code)
779
if ((code & 0xff00) != 0) {
780
first = (code >> 8) & 0xff;
789
/* Leading byte of `code` in an encoding of at most four bytes: `first` is
 * taken from the highest non-zero byte of code, testing bits 24-31, then
 * 16-23, then 8-15. */
onigenc_mb4_code_to_mbc_first(OnigCodePoint code)
793
if ((code & 0xff000000) != 0) {
794
first = (code >> 24) & 0xff;
796
else if ((code & 0xff0000) != 0) {
797
first = (code >> 16) & 0xff;
799
else if ((code & 0xff00) != 0) {
800
first = (code >> 8) & 0xff;
809
/* Encode `code` into buf for an encoding of at most two bytes.
 *
 * The high byte (bits 8-15) is written only when non-zero; the low byte is
 * always written.  Afterwards the number of bytes written (p - buf) is
 * checked against enc_len(enc, buf); a mismatch yields
 * ONIGENCERR_INVALID_WIDE_CHAR_VALUE. */
onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
813
if ((code & 0xff00) != 0) {
814
*p++ = (UChar )((code >> 8) & 0xff);
816
*p++ = (UChar )(code & 0xff);
819
if (enc_len(enc, buf) != (p - buf))
820
return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
826
/* Encode `code` into buf for an encoding of at most four bytes.
 *
 * Bytes are emitted from most to least significant; each of the three upper
 * bytes is written only when that byte of code is non-zero, and the lowest
 * byte is always written.  Afterwards the number of bytes written (p - buf)
 * is checked against enc_len(enc, buf); a mismatch yields
 * ONIGENCERR_INVALID_WIDE_CHAR_VALUE.
 *
 * NOTE(review): the three guards are independent `if`s that look only at
 * the code bits, so a zero byte below an already-emitted higher byte (e.g.
 * 0xAA00BBCC) is skipped rather than written.  Confirm whether the guards
 * for bits 16-23 and 8-15 should also fire once p != buf. */
onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
830
if ((code & 0xff000000) != 0) {
831
*p++ = (UChar )((code >> 24) & 0xff);
833
if ((code & 0xff0000) != 0) {
834
*p++ = (UChar )((code >> 16) & 0xff);
836
if ((code & 0xff00) != 0) {
837
*p++ = (UChar )((code >> 8) & 0xff);
839
*p++ = (UChar )(code & 0xff);
842
if (enc_len(enc, buf) != (p - buf))
843
return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
849
/* Ctype test for a code point in an encoding of at most two bytes.
 *
 * When ctype includes ONIGENC_CTYPE_WORD the result comes either from the
 * ASCII ctype lookup (ONIGENC_IS_ASCII_CODE_CTYPE) or from whether the code
 * encodes to more than one byte (ONIGENC_CODE_TO_MBCLEN > 1).  The WORD bit
 * is then cleared and an empty remaining ctype yields FALSE.  Otherwise the
 * ASCII ctype lookup decides.
 * NOTE(review): the conditions choosing between the two WORD returns
 * (presumably an ASCII-range check on code) are not visible here; also
 * check whether the statements after those returns are reachable. */
onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
852
if ((ctype & ONIGENC_CTYPE_WORD) != 0) {
854
return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
856
return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
858
ctype &= ~ONIGENC_CTYPE_WORD;
859
if (ctype == 0) return FALSE;
863
return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
869
/* Ctype test for a code point in an encoding of at most four bytes; the
 * visible statements are the same as in onigenc_mb2_is_code_ctype.
 *
 * When ctype includes ONIGENC_CTYPE_WORD the result comes either from the
 * ASCII ctype lookup (ONIGENC_IS_ASCII_CODE_CTYPE) or from whether the code
 * encodes to more than one byte (ONIGENC_CODE_TO_MBCLEN > 1).  The WORD bit
 * is then cleared and an empty remaining ctype yields FALSE.  Otherwise the
 * ASCII ctype lookup decides.
 * NOTE(review): the conditions choosing between the two WORD returns
 * (presumably an ASCII-range check on code) are not visible here; also
 * check whether the statements after those returns are reachable. */
onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code,
872
if ((ctype & ONIGENC_CTYPE_WORD) != 0) {
874
return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
876
return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE);
878
ctype &= ~ONIGENC_CTYPE_WORD;
879
if (ctype == 0) return FALSE;
883
return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
889
/* Compare the encoded text [p, end) against the ASCII string `sascii`.
 *
 * If the encoded text runs out first (p >= end), the current ASCII byte
 * *sascii is returned as an int.  Otherwise the next character is decoded
 * with ONIGENC_MBC_TO_CODE into c and p is advanced by enc_len(enc, p).
 * NOTE(review): presumably at most n characters are compared and the
 * difference between *sascii and c is returned on mismatch -- confirm. */
onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end,
890
const UChar* sascii /* ascii */, int n)
895
if (p >= end) return (int )(*sascii);
897
c = (int )ONIGENC_MBC_TO_CODE(enc, p, end);
902
p += enc_len(enc, p);
907
#else /* ONIG_RUBY_M17N */
910
/* ONIG_RUBY_M17N build: test whether `code` belongs to character class
 * `ctype` by dispatching on ctype.
 *
 *   NEWLINE        -> 1 when code is 0x0a
 *   ASCII          -> TRUE when code < 128, else FALSE
 *   BLANK, GRAPH   -> ONIGENC_IS_CODE_BLANK / ONIGENC_IS_CODE_GRAPH
 *   WORD           -> m17n_iswchar()
 *   remaining listed classes -> the matching m17n_is*() predicate */
onigenc_is_code_ctype(OnigEncoding enc, OnigCodePoint code, int ctype)
913
case ONIGENC_CTYPE_NEWLINE:
914
if (code == 0x0a) return 1;
917
case ONIGENC_CTYPE_ALPHA:
918
return m17n_isalpha(enc, code);
920
case ONIGENC_CTYPE_BLANK:
921
return ONIGENC_IS_CODE_BLANK(enc, (int )(code));
923
case ONIGENC_CTYPE_CNTRL:
924
return m17n_iscntrl(enc, code);
926
case ONIGENC_CTYPE_DIGIT:
927
return m17n_isdigit(enc, code);
929
case ONIGENC_CTYPE_GRAPH:
930
return ONIGENC_IS_CODE_GRAPH(enc, (int )(code));
932
case ONIGENC_CTYPE_LOWER:
933
return m17n_islower(enc, code);
935
case ONIGENC_CTYPE_PRINT:
936
return m17n_isprint(enc, code);
938
case ONIGENC_CTYPE_PUNCT:
939
return m17n_ispunct(enc, code);
941
case ONIGENC_CTYPE_SPACE:
942
return m17n_isspace(enc, code);
944
case ONIGENC_CTYPE_UPPER:
945
return m17n_isupper(enc, code);
947
case ONIGENC_CTYPE_XDIGIT:
948
return m17n_isxdigit(enc, code);
950
case ONIGENC_CTYPE_WORD:
951
return m17n_iswchar(enc, code);
953
case ONIGENC_CTYPE_ASCII:
954
return (code < 128 ? TRUE : FALSE);
956
case ONIGENC_CTYPE_ALNUM:
957
return m17n_isalnum(enc, code);
967
/* ONIG_RUBY_M17N build: encode `code` into buf with m17n_mbcput(), then
 * derive the byte length from the code's first byte (m17n_firstbyte()
 * passed to enc_len()).
 * NOTE(review): len is presumably the return value -- confirm. */
onigenc_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf)
971
m17n_mbcput(enc, code, buf);
972
c = m17n_firstbyte(enc, code);
973
len = enc_len(enc, c);
978
/* ONIG_RUBY_M17N build: lower-case the character at p into buf.
 *
 * The character is decoded with m17n_codepoint() over the enc_len(enc, *p)
 * bytes starting at p, converted with m17n_tolower(), and written to buf
 * with m17n_mbcput().  Returns m17n_codelen() of the lower-cased code. */
onigenc_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* buf)
982
c = m17n_codepoint(enc, p, p + enc_len(enc, *p));
983
low = m17n_tolower(enc, c);
984
m17n_mbcput(enc, low, buf);
986
return m17n_codelen(enc, low);
990
/* ONIG_RUBY_M17N build: case-ambiguity test for one character.
 *
 * len is enc_len() of the lead byte *p and c is the code point decoded from
 * [p, p + len).  When `flag` has ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE set, the
 * character is tested with m17n_isupper() || m17n_islower().
 * NOTE(review): p is presumably *pp, and *pp is presumably advanced by len;
 * the returned values are not visible here -- confirm. */
onigenc_is_mbc_ambiguous(OnigEncoding enc, OnigAmbigType flag,
991
UChar** pp, UChar* end)
997
len = enc_len(enc, *p);
999
c = m17n_codepoint(enc, p, p + len);
1001
if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
1002
if (m17n_isupper(enc, c) || m17n_islower(enc, c))
1010
/* ONIG_RUBY_M17N build: left-adjust s to a character head.
 *
 * s at or before start is returned unchanged.  Otherwise p is moved
 * backwards until m17n_islead() accepts the byte at p or p reaches start,
 * then stepped forward by enc_len() while the character at p ends before s.
 * If the character at p ends exactly at s (p + len == s), s itself is a
 * head and is returned.
 * NOTE(review): p presumably starts at s and is the result when the
 * character at p spans s -- confirm. */
onigenc_get_left_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s)
1015
if (s <= start) return s;
1018
while (!m17n_islead(enc, *p) && p > start) p--;
1019
while (p + (len = enc_len(enc, *p)) < s) {
1022
if (p + len == s) return s;
1027
/* ONIG_RUBY_M17N build: the answer is ONIGENC_IS_SINGLEBYTE(enc); s and end
 * do not take part in the returned expression. */
onigenc_is_allowed_reverse_match(OnigEncoding enc,
1028
const UChar* s, const UChar* end)
1030
return ONIGENC_IS_SINGLEBYTE(enc);
1034
/* ONIG_RUBY_M17N build: intentionally empty -- `table` is ignored. */
onigenc_set_default_caseconv_table(UChar* table) { }
1036
#endif /* ONIG_RUBY_M17N */