2
# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $
4
# Copyright 1997, 1998, 1999 Computing Research Labs,
5
# New Mexico State University
7
# Permission is hereby granted, free of charge, to any person obtaining a
8
# copy of this software and associated documentation files (the "Software"),
9
# to deal in the Software without restriction, including without limitation
10
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
11
# and/or sell copies of the Software, and to permit persons to whom the
12
# Software is furnished to do so, subject to the following conditions:
14
# The above copyright notice and this permission notice shall be included in
15
# all copies or substantial portions of the Software.
17
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
21
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
22
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
23
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
Unicode and Regular Expressions
30
This is a simple regular expression package for matching against Unicode text
31
in UCS2 form. The implementation of this URE package is a variation on the
32
RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark
33
Hopkins' algorithm had the virtue of being very simple, so it was used as a
36
---------------------------------------------------------------------------
40
o Regular expression and text already normalized.
42
o Conversion to lower case assumes a 1-1 mapping.
46
Separator - any one of U+2028, U+2029, '\n', '\r'.
49
. - match any character.
50
* - match zero or more of the last subexpression.
51
+ - match one or more of the last subexpression.
52
? - match zero or one of the last subexpression.
53
() - subexpression grouping.
57
o The "." operator normally does not match separators, but a flag is
58
available for the ure_exec() function that will allow this operator to
61
Literals and Constants:
63
c - literal UCS2 character.
64
\x.... - hexadecimal number of up to 4 digits.
65
\X.... - hexadecimal number of up to 4 digits.
66
\u.... - hexadecimal number of up to 4 digits.
67
\U.... - hexadecimal number of up to 4 digits.
71
[...] - Character class.
72
[^...] - Negated character class.
73
\pN1,N2,...,Nn - Character properties class.
74
\PN1,N2,...,Nn - Negated character properties class.
76
POSIX character classes recognized:
92
o Character property classes are \p or \P followed by a comma separated
93
list of integers between 1 and 32. These integers are references to
94
the following character properties:
97
--------------------------
131
o Character classes can contain literals, constants, and character
132
property classes. Example:
136
---------------------------------------------------------------------------
140
Before URE is used, two functions need to be created: one to check if a
141
character matches a set of URE character properties, and one to convert a
142
character to lower case.
144
Stubs for these functions are located in the urestubs.c file.
149
Sample pseudo-code fragment.
154
unsigned long relen, textlen;
155
unsigned long match_start, match_end;
158
* Allocate the dynamic storage needed to compile regular expressions.
160
rebuf = ure_buffer_create();
162
for each regular expression in a list {
163
re = next regular expression;
167
* Compile the regular expression with the case insensitive flag
170
dfa = ure_compile(re, relen, 1, rebuf);
173
* Look for the first match in some text. The matching will be done
174
* in a case insensitive manner because the expression was compiled
175
* with the case insensitive flag on.
177
if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
178
printf("MATCH: %lu %lu\n", match_start, match_end);
181
* Look for the first match in some text, ignoring non-spacing
184
if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
185
&match_start, &match_end))
186
printf("MATCH: %lu %lu\n", match_start, match_end);
195
* Free the dynamic storage used for compiling the expressions.
197
ure_buffer_free(rebuf);
199
---------------------------------------------------------------------------
201
Mark Leisher <mleisher@crl.nmsu.edu>
204
===========================================================================
210
Date : 21 September 1999
211
==========================
212
1. Added copyright stuff and put in CVS.