4
* This file contains locale-specific regexp routines.
5
* This file is #included by regcomp.c.
7
* Copyright (c) 1998 by Scriptics Corporation.
9
* This software is copyrighted by the Regents of the University of
10
* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11
* Corporation and other parties. The following terms apply to all files
12
* associated with the software unless explicitly disclaimed in
15
* The authors hereby grant permission to use, copy, modify, distribute,
16
* and license this software and its documentation for any purpose, provided
17
* that existing copyright notices are retained in all copies and that this
18
* notice is included verbatim in any distributions. No written agreement,
19
* license, or royalty fee is required for any of the authorized uses.
20
* Modifications to this software may be copyrighted by their authors
21
* and need not follow the licensing terms described here, provided that
22
* the new terms are clearly indicated on the first page of each file where
25
* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26
* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27
* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28
* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29
* POSSIBILITY OF SUCH DAMAGE.
31
* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33
* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34
* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35
* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
38
* GOVERNMENT USE: If you are acquiring this software on behalf of the
39
* U.S. government, the Government shall have only "Restricted Rights"
40
* in the software and related documentation as defined in the Federal
41
* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42
* are acquiring the software on behalf of the Department of Defense, the
43
* software shall be classified as "Commercial Computer Software" and the
44
* Government shall have only "Restricted Rights" as defined in Clause
45
* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46
* authors grant the U.S. Government and others acting in its behalf
47
* permission to use and distribute the software in accordance with the
48
* terms specified in this license.
50
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.6 2004-05-07 00:24:57 tgl Exp $
53
/* ASCII character-name table */
123
"carriage-return", '\r'
195
"exclamation-mark", '!'
198
"quotation-mark", '"'
216
"left-parenthesis", '('
219
"right-parenthesis", ')'
285
"less-than-sign", '<'
291
"greater-than-sign", '>'
300
"left-square-bracket", '['
306
"reverse-solidus", '\\'
309
"right-square-bracket", ']'
315
"circumflex-accent", '^'
330
"left-curly-bracket", '{'
339
"right-curly-bracket", '}'
353
* some ctype functions with non-ascii-char guard
356
pg_wc_isdigit(pg_wchar c)
358
return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
362
pg_wc_isalpha(pg_wchar c)
364
return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
368
pg_wc_isalnum(pg_wchar c)
370
return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
374
pg_wc_isupper(pg_wchar c)
376
return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
380
pg_wc_islower(pg_wchar c)
382
return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
386
pg_wc_isgraph(pg_wchar c)
388
return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
392
pg_wc_isprint(pg_wchar c)
394
return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
398
pg_wc_ispunct(pg_wchar c)
400
return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
404
pg_wc_isspace(pg_wchar c)
406
return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
410
pg_wc_toupper(pg_wchar c)
412
if (c >= 0 && c <= UCHAR_MAX)
413
return toupper((unsigned char) c);
418
pg_wc_tolower(pg_wchar c)
420
if (c >= 0 && c <= UCHAR_MAX)
421
return tolower((unsigned char) c);
427
* nmcces - how many distinct MCCEs are there?
430
nmcces(struct vars * v)
433
* No multi-character collating elements defined at the moment.
439
* nleaders - how many chrs can be first chrs of MCCEs?
442
nleaders(struct vars * v)
448
* allmcces - return a cvec with all the MCCEs of the locale
451
allmcces(struct vars * v, /* context */
452
struct cvec * cv) /* this is supposed to have enough room */
454
return clearcvec(cv);
458
* element - map collating-element name to celt
461
element(struct vars * v, /* context */
462
chr *startp, /* points to start of name */
463
chr *endp) /* points just past end of name */
468
/* generic: one-chr names stand for themselves */
469
assert(startp < endp);
477
for (cn = cnames; cn->name != NULL; cn++)
479
if (strlen(cn->name) == len &&
480
pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
482
break; /* NOTE BREAK OUT */
485
if (cn->name != NULL)
486
return CHR(cn->code);
488
/* couldn't find it */
494
* range - supply cvec for a range, including legality check
497
range(struct vars * v, /* context */
498
celt a, /* range start */
499
celt b, /* range end, might equal a */
500
int cases) /* case-independent? */
508
if (a != b && !before(a, b))
516
cv = getcvec(v, 0, 1, 0);
523
* When case-independent, it's hard to decide when cvec ranges are
524
* usable, so for now at least, we won't try. We allocate enough
525
* space for two case variants plus a little extra for the two title
529
nchrs = (b - a + 1) * 2 + 4;
531
cv = getcvec(v, nchrs, 0, 0);
534
for (c = a; c <= b; c++)
537
lc = pg_wc_tolower((chr) c);
540
uc = pg_wc_toupper((chr) c);
549
* before - is celt x before celt y, for purposes of range legality?
551
static int /* predicate */
552
before(celt x, celt y)
554
/* trivial because no MCCEs */
561
* eclass - supply cvec for an equivalence class
562
* Must include case counterparts on request.
565
eclass(struct vars * v, /* context */
566
celt c, /* Collating element representing the
567
* equivalence class. */
568
int cases) /* all cases? */
572
/* crude fake equivalence class for testing */
573
if ((v->cflags & REG_FAKE) && c == 'x')
575
cv = getcvec(v, 4, 0, 0);
576
addchr(cv, (chr) 'x');
577
addchr(cv, (chr) 'y');
580
addchr(cv, (chr) 'X');
581
addchr(cv, (chr) 'Y');
586
/* otherwise, none */
588
return allcases(v, c);
589
cv = getcvec(v, 1, 0, 0);
596
* cclass - supply cvec for a character class
598
* Must include case counterparts on request.
601
cclass(struct vars * v, /* context */
602
chr *startp, /* where the name starts */
603
chr *endp, /* just past the end of the name */
604
int cases) /* case-independent? */
607
struct cvec *cv = NULL;
613
* The following arrays define the valid character class names.
616
static char *classNames[] = {
617
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
618
"lower", "print", "punct", "space", "upper", "xdigit", NULL
623
CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
624
CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
628
* Map the name to the corresponding enumerated value.
632
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
634
if (strlen(*namePtr) == len &&
635
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
648
* Remap lower and upper to alpha if the match is case insensitive.
652
((enum classes) index == CC_LOWER ||
653
(enum classes) index == CC_UPPER))
654
index = (int) CC_ALPHA;
657
* Now compute the character class contents.
659
* For the moment, assume that only char codes < 256 can be in these
663
switch ((enum classes) index)
666
cv = getcvec(v, UCHAR_MAX, 0, 0);
669
for (i = 0; i <= UCHAR_MAX; i++)
671
if (pg_wc_isprint((chr) i))
677
cv = getcvec(v, UCHAR_MAX, 0, 0);
680
for (i = 0; i <= UCHAR_MAX; i++)
682
if (pg_wc_isalnum((chr) i))
688
cv = getcvec(v, UCHAR_MAX, 0, 0);
691
for (i = 0; i <= UCHAR_MAX; i++)
693
if (pg_wc_isalpha((chr) i))
699
cv = getcvec(v, 0, 1, 0);
701
addrange(cv, 0, 0x7f);
704
cv = getcvec(v, 2, 0, 0);
709
cv = getcvec(v, 0, 2, 0);
710
addrange(cv, 0x0, 0x1f);
711
addrange(cv, 0x7f, 0x9f);
714
cv = getcvec(v, 0, 1, 0);
716
addrange(cv, (chr) '0', (chr) '9');
719
cv = getcvec(v, UCHAR_MAX, 0, 0);
722
for (i = 0; i <= UCHAR_MAX; i++)
724
if (pg_wc_ispunct((chr) i))
730
cv = getcvec(v, 0, 3, 0);
733
addrange(cv, '0', '9');
734
addrange(cv, 'a', 'f');
735
addrange(cv, 'A', 'F');
739
cv = getcvec(v, UCHAR_MAX, 0, 0);
742
for (i = 0; i <= UCHAR_MAX; i++)
744
if (pg_wc_isspace((chr) i))
750
cv = getcvec(v, UCHAR_MAX, 0, 0);
753
for (i = 0; i <= UCHAR_MAX; i++)
755
if (pg_wc_islower((chr) i))
761
cv = getcvec(v, UCHAR_MAX, 0, 0);
764
for (i = 0; i <= UCHAR_MAX; i++)
766
if (pg_wc_isupper((chr) i))
772
cv = getcvec(v, UCHAR_MAX, 0, 0);
775
for (i = 0; i <= UCHAR_MAX; i++)
777
if (pg_wc_isgraph((chr) i))
789
* allcases - supply cvec for all case counterparts of a chr (including itself)
791
* This is a shortcut, preferably an efficient one, for simple characters;
792
* messy cases are done via range().
795
allcases(struct vars * v, /* context */
796
chr pc) /* character to get case equivs of */
803
lc = pg_wc_tolower((chr) c);
804
uc = pg_wc_toupper((chr) c);
806
cv = getcvec(v, 2, 0, 0);
814
* cmp - chr-substring compare
816
* Backrefs need this. It should preferably be efficient.
817
* Note that it does not need to report anything except equal/unequal.
818
* Note also that the length is exact, and the comparison should not
819
* stop at embedded NULs!
821
static int /* 0 for equal, nonzero for unequal */
822
cmp(const chr *x, const chr *y, /* strings to compare */
823
size_t len) /* exact length of comparison */
825
return memcmp(VS(x), VS(y), len * sizeof(chr));
829
* casecmp - case-independent chr-substring compare
831
* REG_ICASE backrefs need this. It should preferably be efficient.
832
* Note that it does not need to report anything except equal/unequal.
833
* Note also that the length is exact, and the comparison should not
834
* stop at embedded NULs!
836
static int /* 0 for equal, nonzero for unequal */
837
casecmp(const chr *x, const chr *y, /* strings to compare */
838
size_t len) /* exact length of comparison */
840
for (; len > 0; len--, x++, y++)
842
if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))