1
/* find.c -- Searching and replacing
2
Copyright (C) 1993, 1994 John Harper <john@dcs.warwick.ac.uk>
3
$Id: find.c,v 1.33 2001/08/02 07:12:36 jsh Exp $
5
This file is part of Jade.
7
Jade is free software; you can redistribute it and/or modify it
8
under the terms of the GNU General Public License as published by
9
the Free Software Foundation; either version 2, or (at your option)
12
Jade is distributed in the hope that it will be useful, but
13
WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
GNU General Public License for more details.
17
You should have received a copy of the GNU General Public License
18
along with Jade; see the file COPYING. If not, write to
19
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
30
/* Hooks for dealing with the rep_reg_obj match type. */
31
void (*rep_regsub_fun)(int, rep_regsubs *, char *, char *, void *);
32
int (*rep_regsublen_fun)(int, rep_regsubs *, char *, void *);
35
/* Compiling regexps. */
37
/* A linked list is used to store all recently-used regexps in MRU
38
order. At GC the regexps at the tail of the list are freed to
39
satisfy the size limit.
41
It might be better to use a hash-table. But by experience it seems
42
that the cache is usually quite small, and therefore searching the
43
list each compilation isn't too bad (and it makes the gc easier).
45
Also, the hit-ratio is very good (as I'm typing this, ~0.97) */
47
struct cached_regexp {
48
struct cached_regexp *next;
53
static struct cached_regexp *cached_regexps; /* should be a hash table? */
54
static int regexp_hits, regexp_misses;
55
static int regexp_cache_limit = 1024;
57
DEFSYM(regexp_error, "regexp-error");
58
DEFSTRING(err_regexp_error, "Regexp error");
61
rep_compile_regexp(repv re)
63
struct cached_regexp **x = &cached_regexps;
65
assert(rep_STRINGP(re));
66
re_len = rep_STRING_LEN(re);
69
repv saved_re = (*x)->regexp;
70
assert(rep_STRINGP(saved_re));
72
|| (rep_STRING_LEN(saved_re) == re_len
73
&& memcmp(rep_STR(saved_re), rep_STR(re), re_len) == 0))
75
/* Found it. Move this node to the head of the list. Then
76
return the compiled copy. */
77
struct cached_regexp *this = *x;
78
if(x != &cached_regexps)
81
this->next = cached_regexps;
82
cached_regexps = this;
85
return this->compiled;
90
/* No cached copy. Compile it, then add it to the cache. */
92
struct cached_regexp *this;
93
rep_regexp *compiled = rep_regcomp(rep_STR(re));
96
this = rep_alloc(sizeof(struct cached_regexp));
100
this->compiled = compiled;
101
this->next = cached_regexps;
102
cached_regexps = this;
104
rep_data_after_gc += (sizeof(struct cached_regexp)
105
+ compiled->regsize);
113
/* Remove any cached compilation of STRING from the regexp cache */
115
rep_string_modified (repv string)
117
struct cached_regexp **x;
118
for (x = &cached_regexps; *x != 0; x = &((*x)->next))
120
if ((*x)->regexp == string)
122
/* found the string, remove it from the cache */
123
struct cached_regexp *ptr = *x;
125
free (ptr->compiled);
133
mark_cached_regexps(void)
136
struct cached_regexp *x = cached_regexps, *xp = 0;
137
while(x != 0 && total < regexp_cache_limit)
139
assert(rep_STRINGP(x->regexp));
140
rep_MARKVAL(x->regexp);
141
total += sizeof(struct cached_regexp) + x->compiled->regsize;
147
/* Free all following regexps */
160
/* Free all cached regexps */
162
release_cached_regexps(void)
164
struct cached_regexp *x = cached_regexps;
168
struct cached_regexp *next = x->next;
176
/* Storing regexp context. */
178
/* Storage for remembering where the last match was.
179
last_match_data is the string or buffer that was matched against.
180
last_matches is a copy of the subexpression data of the last match. */
181
static rep_regtype last_match_type;
182
static repv last_match_data;
183
static rep_regsubs last_matches;
185
struct rep_saved_regexp_data *rep_saved_matches;
188
rep_update_last_match(repv data, rep_regexp *prog)
190
last_match_type = prog->lasttype;
191
last_match_data = data;
192
memcpy(&last_matches, &prog->matches, sizeof(last_matches));
197
rep_mark_regexp_data(void)
199
struct rep_saved_regexp_data *sd;
201
/* Don't keep too many cached REs through GC. */
202
mark_cached_regexps();
204
if(last_match_type == rep_reg_obj)
207
for(i = 0; i < rep_NSUBEXP; i++)
209
rep_MARKVAL(last_matches.obj.startp[i]);
210
rep_MARKVAL(last_matches.obj.endp[i]);
213
rep_MARKVAL(last_match_data);
215
for(sd = rep_saved_matches; sd != 0; sd = sd->next)
217
if(sd->type == rep_reg_obj)
220
for(i = 0; i < rep_NSUBEXP; i++)
222
rep_MARKVAL(sd->matches.obj.startp[i]);
223
rep_MARKVAL(sd->matches.obj.endp[i]);
226
rep_MARKVAL(sd->data);
230
/* Fix the match buffers to reflect matching a string from START to END. */
232
rep_set_string_match(repv obj, repv start, repv end)
235
last_match_data = obj;
236
last_match_type = rep_reg_obj;
237
last_matches.obj.startp[0] = start;
238
last_matches.obj.endp[0] = end;
239
for(i = 1; i < rep_NSUBEXP; i++)
241
last_matches.obj.startp[i] = rep_NULL;
242
last_matches.obj.endp[i] = rep_NULL;
247
rep_push_regexp_data(struct rep_saved_regexp_data *sd)
249
sd->type = last_match_type;
250
sd->data = last_match_data;
251
memcpy(&sd->matches, &last_matches, sizeof(rep_regsubs));
252
sd->next = rep_saved_matches;
253
rep_saved_matches = sd;
257
rep_pop_regexp_data(void)
259
struct rep_saved_regexp_data *sd = rep_saved_matches;
260
rep_saved_matches = sd->next;
261
last_match_type = sd->type;
262
last_match_data = sd->data;
263
memcpy(&last_matches, &sd->matches, sizeof(rep_regsubs));
267
/* Simple string matching */
269
DEFUN("string-match", Fstring_match, Sstring_match, (repv re, repv str, repv start, repv nocasep), rep_Subr4) /*
270
::doc:rep.regexp#string-match::
271
string-match REGEXP STRING [START] [IGNORE-CASE-P]
273
Return t if REGEXP matches STRING. Updates the match data.
275
When defined, START is the index of the first character to start
276
matching at (counting from zero). When IGNORE-CASE-P is non-nil the
277
case of matched strings are ignored. Note that character classes are
278
still case-significant.
283
rep_DECLARE1(re, rep_STRINGP);
284
rep_DECLARE2(str, rep_STRINGP);
285
rep_DECLARE3_OPT(start, rep_INTP);
286
xstart = rep_INTP(start) ? rep_INT(start) : 0;
287
prog = rep_compile_regexp(re);
291
if(rep_regexec2(prog, rep_STR(str) + xstart,
292
(rep_NILP(nocasep) ? 0 : rep_REG_NOCASE)
293
| (xstart == 0 ? 0 : rep_REG_NOTBOL)))
295
rep_update_last_match(str, prog);
305
DEFUN("string-looking-at", Fstring_looking_at, Sstring_looking_at, (repv re, repv string, repv start, repv nocasep), rep_Subr4) /*
306
::doc:rep.regexp#string-looking-at::
307
string-looking-at REGEXP STRING [START] [IGNORE-CASE-P]
309
Returns t if REGEXP matches the STRING (starting at character START).
310
Updates the match data.
315
rep_DECLARE1(re, rep_STRINGP);
316
rep_DECLARE2(string, rep_STRINGP);
317
rep_DECLARE3_OPT(start, rep_INTP);
318
xstart = rep_INTP(start) ? rep_INT(start) : 0;
319
prog = rep_compile_regexp(re);
323
if(rep_regmatch_string(prog, rep_STR(string) + xstart,
324
(rep_NILP(nocasep) ? 0 : rep_REG_NOCASE)
325
| (xstart == 0 ? 0 : rep_REG_NOTBOL)))
327
rep_update_last_match(string, prog);
337
DEFUN("expand-last-match", Fexpand_last_match, Sexpand_last_match, (repv template), rep_Subr1) /*
338
::doc:rep.regexp#expand-last-match::
339
expand-last-match TEMPLATE-STRING
341
Expand the saved expressions from the most recent successfully matched
342
regexp according to TEMPLATE-STRING, a string that may contain any of
343
the following escape sequences,
345
\0, \& whole string matched by REGEXP
346
\N N'th parenthensized expression (1 <= N <= 9)
351
rep_DECLARE1(template, rep_STRINGP);
352
len = (*rep_regsublen_fun)(last_match_type, &last_matches,
353
rep_STR(template), rep_PTR(last_match_data));
354
string = rep_make_string(len);
355
(*rep_regsub_fun)(last_match_type, &last_matches,
356
rep_STR(template), rep_STR(string),
357
rep_PTR(last_match_data));
361
DEFUN("match-start", Fmatch_start, Smatch_start, (repv exp), rep_Subr1) /*
362
::doc:rep.regexp#match-start::
363
match-start [EXPRESSION-INDEX]
365
Return the position which the EXPRESSION-INDEX'th parenthesised expression
366
started at in the last successful regexp match. If EXPRESSION-INDEX is
367
nil or 0 the start of the whole match is returned instead.
368
The returned value will either be a position if the last match was in a
369
buffer, or an integer if the last match was in a string (i.e. regexp-match).
373
rep_DECLARE1_OPT(exp, rep_INTP);
377
if((i >= rep_NSUBEXP) || (i < 0))
378
return(rep_signal_arg_error(exp, 1));
382
if(last_match_type == rep_reg_obj)
384
if(last_matches.obj.startp[i] != rep_NULL)
385
return last_matches.obj.startp[i];
390
if(last_matches.string.startp[i] == NULL)
392
i = last_matches.string.startp[i] - (char *)rep_STR(last_match_data);
393
return(rep_MAKE_INT(i));
397
DEFUN("match-end", Fmatch_end, Smatch_end, (repv exp), rep_Subr1) /*
398
::doc:rep.regexp#match-end::
399
match-end [EXPRESSION-INDEX]
401
Return the position which the EXPRESSION-INDEX'th parenthesised expression
402
ended at in the last successful regexp match. If EXPRESSION-INDEX is
403
nil or 0 the end of the whole match is returned instead.
404
The returned value will either be a position if the last match was in a
405
buffer, or an integer if the last match was in a string (i.e. regexp-match).
409
rep_DECLARE1_OPT(exp, rep_INTP);
413
if((i >= rep_NSUBEXP) || (i < 0))
414
return rep_signal_arg_error(exp, 1);
418
if(last_match_type == rep_reg_obj)
420
if(last_matches.obj.endp[i] != rep_NULL)
421
return last_matches.obj.endp[i];
426
if(last_matches.string.endp[i] == NULL)
428
i = last_matches.string.endp[i] - (char *)rep_STR(last_match_data);
429
return(rep_MAKE_INT(i));
433
DEFUN("quote-regexp", Fquote_regexp, Squote_regexp, (repv str), rep_Subr1) /*
434
::doc:rep.regexp#quote-regexp::
437
Returns a new version of STRING, any characters which the regexp routines
438
treat specially (asterisks, square brackets, etc...) is quoted by the escape
439
character `\'. If the STRING does not contain any regexp meta-characters
440
it is returned as-is (un-copied).
444
int buflen = 128, slen, i = 0;
445
rep_bool quoted = rep_FALSE;
447
rep_DECLARE1(str, rep_STRINGP);
449
slen = rep_STRING_LEN(str);
450
buf = rep_alloc(buflen);
456
/* Ensure string is long enough, this saves doing this twice. */
459
int newlen = buflen * 2;
460
u_char *newbuf = rep_alloc(newlen);
463
memcpy(newbuf, buf, i);
481
case '\\': /* do I want to do this? */
482
/* quote this character */
495
res = rep_string_dupn(buf, i);
502
DEFUN("regexp-cache-control", Fregexp_cache_control,
503
Sregexp_cache_control, (repv limit), rep_Subr1) /*
504
::doc:rep.regexp#regexp-cache-control::
505
regexp-cache-control [SOFT-LIMIT]
507
If SOFT-LIMIT is defined, it specifies the maximum number of bytes that
508
the regexp cache may occupy after garbage collection.
510
Returns (SOFT-LIMIT CURRENT-SIZE CURRENT-ENTRIES HITS MISSES).
513
int current_size = 0, current_items = 0;
514
struct cached_regexp *x;
516
rep_DECLARE1_OPT(limit, rep_INTP);
517
if(rep_INTP(limit) && rep_INT(limit) >= 0)
518
regexp_cache_limit = rep_INT(limit);
524
current_size += sizeof(struct cached_regexp) + x->compiled->regsize;
528
return rep_list_5(rep_MAKE_INT(regexp_cache_limit),
529
rep_MAKE_INT(current_size), rep_MAKE_INT(current_items),
530
rep_MAKE_INT(regexp_hits), rep_MAKE_INT(regexp_misses));
534
rep_regerror(char *err)
536
Fsignal(Qregexp_error, rep_LIST_1(rep_string_dup(err)));
542
repv tem = rep_push_structure ("rep.regexp");
543
rep_ADD_SUBR(Sstring_match);
544
rep_ADD_SUBR(Sstring_looking_at);
545
rep_ADD_SUBR(Sexpand_last_match);
546
rep_ADD_SUBR(Smatch_start);
547
rep_ADD_SUBR(Smatch_end);
548
rep_ADD_SUBR(Squote_regexp);
549
rep_ADD_SUBR(Sregexp_cache_control);
550
rep_pop_structure (tem);
552
rep_INTERN(regexp_error); rep_ERROR(regexp_error);
553
rep_regsub_fun = rep_default_regsub;
554
rep_regsublen_fun = rep_default_regsublen;
560
release_cached_regexps();