1
/* This file is part of Malaga, a system for Natural Language Analysis.
2
* Copyright (C) 1995-1999 Bjoern Beutel
5
* Universitaet Erlangen-Nuernberg
6
* Abteilung fuer Computerlinguistik
9
* e-mail: malaga@linguistik.uni-erlangen.de
11
* This program is free software; you can redistribute it and/or modify
12
* it under the terms of the GNU General Public License as published by
13
* the Free Software Foundation; either version 2 of the License, or
14
* (at your option) any later version.
16
* This program is distributed in the hope that it will be useful,
17
* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
* GNU General Public License for more details.
21
* You should have received a copy of the GNU General Public License
22
* along with this program; if not, write to the Free Software
23
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
25
/* description ==============================================================*/
27
/* This module contains function to compile and execute pattern matching
30
/* includes =================================================================*/
43
/* constants ================================================================*/
45
#define SPECIAL_CHARS ".[]-^()*?+|\\" /* characters with a special meaning */
47
#define PATTERN_STACK_MAX 50 /* maximum size of pattern stack */
49
/* These are the instructions for matching a pattern.
51
* A pattern is a 0-terminated sequence of CHARs, defined as follows:
52
* C[] is the code vector.
53
* PC means pattern counter (pointer to char following a PAT_ instruction).
54
* S[] is the string to be examined.
55
* I is the index into the string.
56
* CS[] is the code stack and IS[] is the string index stack.
57
* SP is the stack pointer (same for RS[] and IS[]).
58
* VS[] and VE[] are vectors to store the index of start and end of vars. */
62
PAT_JUMP = 1, /* PC += (byte_t) C[PC]; */
63
PAT_JUMP_NOW, /* SP++; CS[SP] = PC+1; IS[SP] = I; PC += (byte_t) C[PC]; */
64
PAT_JUMP_LATER, /* SP++; CS[SP] = PC + (byte_t) C[PC]; IS[SP] = I; PC++; */
65
PAT_MATCH_ANY, /* if S[I] != EOS
68
PAT_MATCH_CLASS, /* if (S[I] in {C[PC+1],..,C[PC+C[PC]})
69
* then I++; PC += C[PC]+1;
71
PAT_MATCH_NOT_CLASS, /* if (S[I] in {C[PC+1],..,C[PC+C[PC]})
73
* else I++; PC += C[PC]+1; */
74
PAT_START_VAR_0, /* VS[0] = I; */
75
PAT_START_VAR_1, /* VS[1] = I; */
76
PAT_START_VAR_2, /* VS[2] = I; */
77
PAT_START_VAR_3, /* VS[3] = I; */
78
PAT_START_VAR_4, /* VS[4] = I; */
79
PAT_END_VAR_0, /* VE[0] = I; */
80
PAT_END_VAR_1, /* VE[1] = I; */
81
PAT_END_VAR_2, /* VE[2] = I; */
82
PAT_END_VAR_3, /* VE[3] = I; */
83
PAT_END_VAR_4 /* VE[4] = I; */
86
/* forwards =================================================================*/
88
FORWARD void local_compile_pattern (text_t text,
90
bool_t *may_be_empty);
92
/* functions ================================================================*/
94
LOCAL bool_t is_pattern_char (string_t s)
96
return ((s[0] == '\\' && s[1] != EOS && strchr (SPECIAL_CHARS, s[1]) != NULL)
97
|| (IS_PRINT (s[0]) && strchr (SPECIAL_CHARS, s[0]) == NULL));
100
/*---------------------------------------------------------------------------*/
102
LOCAL char pattern_char (string_t *string_ptr)
103
/* See if *<string_ptr> points to a valid char or to an escape sequence.
104
* Return the character code. Show an error if not valid. */
106
string_t s = *string_ptr;
109
if (s[0] == '\\' && s[1] != EOS && strchr (SPECIAL_CHARS, s[1]) != NULL)
114
else if (IS_PRINT (s[0]) && strchr (SPECIAL_CHARS, s[0]) == NULL)
119
else if (s[0] == EOS)
120
error ("unexpected end of pattern");
122
error ("invalid char \"%c\" in pattern", s[0]);
128
/*---------------------------------------------------------------------------*/
130
LOCAL char offset (int_t offset)
131
/* Return <offset> as a char. */
133
/* See if offset can be stored in a char. */
134
if ((char) offset != offset || offset == 0)
135
error ("pattern too complex");
137
return (char) offset;
140
/*---------------------------------------------------------------------------*/
142
LOCAL void compile_char_class (text_t text, string_t *string_ptr)
143
/* Compile a character class at <*string_ptr>.
144
* Save the resulting pattern in <text>. */
146
string_t s = *string_ptr;
153
add_char_to_text (text, PAT_MATCH_NOT_CLASS);
156
add_char_to_text (text, PAT_MATCH_CLASS);
158
patch_index = text_length (text);
160
do /* Read chars and ranges. */
165
error ("missing \"]\" in pattern");
167
ca = ORD (pattern_char (&s));
171
ce = ORD (pattern_char (&s));
173
error ("invalid range \"%c-%c\" in pattern", (char) ca, (char) ce);
178
for (c = ca; c <= ce; c++)
179
add_char_to_text (text, (char) c);
184
insert_char_in_text (text, offset (text_length (text) - patch_index),
188
/*---------------------------------------------------------------------------*/
190
LOCAL void compile_atom (text_t text,
191
string_t *string_ptr,
192
bool_t *may_be_empty)
193
/* Compile an atom at <*string_ptr>.
194
* Save the resulting pattern in <text>.
195
* <may_be_empty> == TRUE iff the atom may match an empty string. */
197
string_t s = *string_ptr;
199
bool_t local_may_be_empty;
201
*may_be_empty = TRUE;
204
local_may_be_empty = FALSE;
205
start = text_length (text);
208
compile_char_class (text, &s);
212
add_char_to_text (text, PAT_MATCH_ANY);
217
local_compile_pattern (text, &s, &local_may_be_empty);
219
error ("missing \")\" in pattern");
221
else if (is_pattern_char (s))
222
add_char_to_text (text, pattern_char (&s));
226
length = text_length (text) - start;
228
/* There may be a postfix operator following parentheses. */
232
insert_char_in_text (text, PAT_JUMP_NOW, start);
233
insert_char_in_text (text, offset (2 + length), start + 1);
238
if (local_may_be_empty)
239
error ("pattern argument for \"*\" may be empty");
240
insert_char_in_text (text, PAT_JUMP, start);
241
insert_char_in_text (text, offset (2 + length), start + 1);
242
add_char_to_text (text, PAT_JUMP_LATER);
243
add_char_to_text (text, offset (- length));
248
if (local_may_be_empty)
249
error ("pattern argument for \"+\" may be empty");
250
add_char_to_text (text, PAT_JUMP_LATER);
251
add_char_to_text (text, offset (- length));
252
*may_be_empty = FALSE;
255
*may_be_empty &= local_may_be_empty;
261
/*---------------------------------------------------------------------------*/
263
LOCAL void local_compile_pattern (text_t text,
264
string_t *string_ptr,
265
bool_t *may_be_empty)
266
/* Convert <string> to a pattern to be used as input to "match_pattern".
267
* If <pattern_var_no> != -1, mark the pattern so the string matching this
268
* pattern will be stored in <pattern_var[pattern_var_no]>.
269
* The result pattern must be freed with "free" after usage. */
271
string_t s = *string_ptr;
272
int_t alternative_length, alternative_start;
273
bool_t local_may_be_empty;
275
alternative_start = text_length (text);
276
compile_atom (text, &s, may_be_empty);
277
alternative_length = text_length (text) - alternative_start;
283
/* Add jump from start of last alternative to start of this alternative. */
284
insert_char_in_text (text, PAT_JUMP_LATER, alternative_start++);
285
insert_char_in_text (text, alternative_length + 4, alternative_start++);
287
alternative_start = text_length (text);
288
compile_atom (text, &s, &local_may_be_empty);
289
alternative_length = text_length (text) - alternative_start;
291
*may_be_empty |= local_may_be_empty;
293
/* Add from end of last alternative to end of this alternative. */
294
insert_char_in_text (text, PAT_JUMP, alternative_start++);
296
insert_char_in_text (text, alternative_length + 4, alternative_start++);
298
insert_char_in_text (text, alternative_length + 2, alternative_start++);
304
/*---------------------------------------------------------------------------*/
306
GLOBAL string_t compile_pattern (string_t string, int_t pattern_var_no)
307
/* Convert <string> to a pattern to be used as input to "match_pattern".
308
* If <pattern_var_no> != -1, mark the pattern so the string matching this
309
* pattern will be stored in <pattern_var[pattern_var_no]>.
310
* The result pattern must be freed with "free" after usage. */
312
text_t text = new_text ();
315
if (pattern_var_no != -1)
316
add_char_to_text (text, PAT_START_VAR_0 + pattern_var_no);
318
local_compile_pattern (text, &string, &may_be_empty);
320
error ("illegal char \"%c\" in pattern", *string);
322
if (pattern_var_no != -1)
323
add_char_to_text (text, PAT_END_VAR_0 + pattern_var_no);
325
return text_to_string (&text);
328
/*---------------------------------------------------------------------------*/
330
GLOBAL bool_t match_pattern (string_t string, string_t pattern)
331
/* Test whether <string> matches <pattern> (a string of chars compiled with
332
* "compile_pattern") and set substrings in <pattern_var>.
333
* The substrings can be freed with "free_mem". */
335
struct {string_t string, pattern;} stack[PATTERN_STACK_MAX]; /* backtrack */
336
struct {string_t start; string_t end;} var[PATTERN_VAR_MAX];
338
bool_t found_mismatch;
341
found_mismatch = FALSE;
343
for (i = 0; i < PATTERN_VAR_MAX; i++)
344
var[i].start = var[i].end = NULL;
346
while (! found_mismatch)
348
char code = *pattern++;
355
for (i = 0; i < PATTERN_VAR_MAX; i++)
357
if (var[i].start != NULL && var[i].end != NULL)
359
free_mem (pattern_var + i);
360
pattern_var[i] = new_string (var[i].start, var[i].end);
366
found_mismatch = TRUE;
370
pattern += (byte_t) *pattern - 1;
374
if (sp == PATTERN_STACK_MAX)
375
error ("match pattern is too complex");
377
stack[sp].string = string;
378
stack[sp].pattern = pattern + 1;
380
pattern += (byte_t) *pattern - 1;
384
if (sp == PATTERN_STACK_MAX)
385
error ("match pattern is too complex");
387
stack[sp].string = string;
388
stack[sp].pattern = pattern + (byte_t) *pattern - 1;
394
if (*string++ == EOS)
395
found_mismatch = TRUE;
398
case PAT_MATCH_CLASS:
400
found_mismatch = TRUE;
406
pattern += (byte_t) *pattern + 1;
407
while (index < pattern && TO_LOWER (*string) != *index)
411
if (index >= pattern)
412
found_mismatch = TRUE;
416
case PAT_MATCH_NOT_CLASS:
418
found_mismatch = TRUE;
424
pattern += (byte_t) *pattern + 1;
425
while (index < pattern && TO_LOWER (*string) != *index)
430
found_mismatch = TRUE;
434
case PAT_START_VAR_0:
435
case PAT_START_VAR_1:
436
case PAT_START_VAR_2:
437
case PAT_START_VAR_3:
438
case PAT_START_VAR_4:
439
var[code - PAT_START_VAR_0].start = string;
447
var[code - PAT_END_VAR_0].end = string;
451
if (code != TO_LOWER (*string))
452
found_mismatch = TRUE;
457
/* If this path was not successful and there is another path, try it. */
458
if (found_mismatch && sp > 0)
461
string = stack[sp].string;
462
pattern = stack[sp].pattern;
463
found_mismatch = FALSE;
470
/* end of file ==============================================================*/