1
<?xml version="1.0" encoding="ISO-8859-1" ?>
2
<!DOCTYPE udf SYSTEM "udf_extension.dtd">
3
<extension name="regexp" version="0.9.5dev">
6
<name>Hartmut Holzgraefe</name>
7
<email>hartmut@mysql.com</email>
10
<license>GPL</license>
13
<version>0.1</version>
14
<date>2004-05-25</date>
26
<function name="regexp_like" returns="int" null="yes">
27
<param name="text" type="string"/>
28
<param name="pattern" type="string"/>
29
<param name="mode" type="string" optional="yes"/>
32
<element name="expr" type="my_regex_t"/>
33
<element name="dynamic" type="int" />
39
// static regex pattern -> we can compile it once and reuse it
43
// we have to make sure we have a NUL terminated C string
44
// as argument for my_regcomp
45
copy = strndup(pattern, pattern_len);
46
stat = my_regcomp(&data->expr, copy, parse_mode(mode), &my_charset_latin1);
50
sprintf(message, "regcomp failed (error: %d)", stat);
64
// free the statically compiled pattern
65
my_regfree(&data->expr);
77
copy = strndup(pattern, pattern_len);
78
stat = my_regcomp(&data->expr, copy, parse_mode(mode), &my_charset_latin1);
81
// TODO: need ERROR() and WARNING() macro
86
copy = strndup(text, text_len);
87
stat = my_regexec(&data->expr, copy, 1, &match, 0);
91
my_regfree(&data->expr);
94
if (stat && (stat != REG_NOMATCH)) {
95
fprintf(stderr, "regexec error %d '%s' '%s'\n", stat, pattern, text);
99
RETURN_INT(stat == REG_NOMATCH ? 0 : 1);
106
# testing simple cases
107
SELECT REGEXP_LIKE("xxxabcxxx", ".*abc.*") AS r1;
108
SELECT REGEXP_LIKE("xxxabdxxx", ".*abc.*") AS r2;
118
# testing case sensitivity
119
SELECT REGEXP_LIKE("xxxABCxxx", ".*abc.*") AS r3;
120
SELECT REGEXP_LIKE("xxxABCxxx", ".*abc.*", "i") AS r4;
130
# testing POSIX character classes
131
SELECT REGEXP_LIKE("abcdef", "^[[:alpha:]]+$") AS r1;
132
SELECT REGEXP_LIKE("123456", "^[[:alpha:]]+$") AS r2;
133
SELECT REGEXP_LIKE("123abcdef", "^[[:xdigit:]]+$") AS r3;
149
<function name="regexp_substr" returns="string" null="yes">
150
<param name="text" type="string"/>
151
<param name="pattern" type="string"/>
154
<element name="expr" type="my_regex_t"/>
155
<element name="dynamic" type="int" />
161
// static regex pattern -> we can compile it once and reuse it
165
// we have to make sure we have a NUL terminated C string
166
// as argument for my_regcomp
167
copy = strndup(pattern, pattern_len);
168
stat = my_regcomp(&data->expr, copy, REG_EXTENDED, &my_charset_latin1);
172
sprintf(message, "regcomp failed (error: %d)", stat);
185
if (!data->dynamic) {
186
// free the statically compiled pattern
187
my_regfree(&data->expr);
199
copy = strndup(pattern, pattern_len);
200
stat = my_regcomp(&data->expr, copy, REG_EXTENDED, &my_charset_latin1);
203
// TODO: need ERROR() and WARNING() macro
208
copy = strndup(text, text_len);
209
stat = my_regexec(&data->expr, copy, 1, &match, 0);
213
my_regfree(&data->expr);
217
if (stat != REG_NOMATCH) {
218
fprintf(stderr, "regexec error %d '%s' '%s'\n", stat, pattern, text);
223
RETURN_STRINGL(text + match.rm_so, match.rm_eo - match.rm_so);
229
SELECT REGEXP_SUBSTR("abc 123 def", "[[:digit:]]+") AS r1;
239
<function name="regexp_instr" returns="int" null="yes">
240
<param name="text" type="string"/>
241
<param name="pattern" type="string"/>
242
<param name="position" type="int" optional="yes" default="1"/>
243
<param name="occurrence" type="int" optional="yes" default="1"/>
244
<param name="return_end" type="int" optional="yes" default="0"/>
245
<param name="mode" type="string" optional="yes" default="c"/>
249
<element name="expr" type="my_regex_t"/>
250
<element name="dynamic" type="int" />
256
// static regex pattern -> we can compile it once and reuse it
260
// we have to make sure we have a NUL terminated C string
261
// as argument for my_regcomp
262
copy = strndup(pattern, pattern_len);
263
stat = my_regcomp(&data->expr, copy, parse_mode(mode), &my_charset_latin1);
267
sprintf(message, "regcomp failed (error: %d)", stat);
280
if (!data->dynamic) {
281
// free the statically compiled pattern
282
my_regfree(&data->expr);
294
position -= 1; /* oracle offsets start at 1, not 0 */
298
copy = strndup(pattern, pattern_len);
299
stat = my_regcomp(&data->expr, copy, parse_mode(mode), &my_charset_latin1);
302
// TODO: need ERROR() and WARNING() macro
307
copy = strndup(text, text_len);
310
position += match.rm_eo;
311
stat = my_regexec(&data->expr, copy + (size_t)position, 1, &match, 0);
312
} while ((stat == 0) && --occurrence > 0);
317
my_regfree(&data->expr);
321
fprintf(stderr, "regexec error %d '%s' '%s'\n", stat, pattern, text);
325
RETURN_INT(position + (return_end ? match.rm_eo : match.rm_so + 1));
330
SELECT REGEXP_INSTR("the quick brown fox jumps ...", "fox") AS r1;
337
SELECT REGEXP_INSTR("lala abc lala abc lala", "abc") AS r1;
338
SELECT REGEXP_INSTR("lala abc lala abc lala", "abc", 6) AS r2;
339
SELECT REGEXP_INSTR("lala abc lala abc lala", "abc", 7) AS r3;
350
SELECT REGEXP_INSTR("lala abc lala abc lala abc lala", "abc", 1, 1) AS r1;
351
SELECT REGEXP_INSTR("lala abc lala abc lala abc lala", "abc", 1, 2) AS r2;
352
SELECT REGEXP_INSTR("lala abc lala abc lala abc lala", "abc", 1, 3) AS r3;
353
SELECT REGEXP_INSTR("lala abc lala abc lala abc lala", "abc", 1, 4) AS r4;
366
# get character position of match start
367
SELECT REGEXP_INSTR("the quick brown fox jumps ...", "fox", 1, 1, 0) AS r1;
368
# get character position of match end
369
SELECT REGEXP_INSTR("the quick brown fox jumps ...", "fox", 1, 1, 1) AS r2;
370
# get character position of match end, use defaults for unused parameters
371
SELECT REGEXP_INSTR("the quick brown fox jumps ...", "fox", NULL, NULL, 1) AS r3;
385
<function name="regexp_replace" returns="string" null="yes">
386
<param name="text" type="string"/>
387
<param name="pattern" type="string"/>
388
<param name="replace" type="string"/>
391
char *c_pattern, *c_replace, *c_text;
394
c_pattern = strndup(pattern, pattern_len);
395
c_replace = strndup(replace, replace_len);
396
c_text = strndup(text, text_len);
398
result = my_regex_replace(c_pattern, c_replace, c_text);
405
RETURN_STRING(result);
412
SELECT REGEXP_REPLACE("lala foo lala", "foo", "bar") AS r1;
421
<code role="header" position="top">
425
#if MYSQL_VERSION_ID < 50000
426
#error need MySQL >= 5.0
430
#include <sys/types.h>
432
// TODO: my_regex.h is not installed by "make install"
433
#include <regex/my_regex.h>
435
// helper function borrowed from PHP, slightly modified
436
static char *my_regex_replace(const char *pattern, const char *replace, const char *string)
441
char *buf, /* buf is where we build the replaced string */
442
*nbuf, /* nbuf is used when we grow the buffer */
443
*walkbuf; /* used to walk buf when replacing backrefs */
444
const char *walk; /* used to walk replacement string for backrefs */
446
int pos, tmp, string_len, new_l;
447
int err, copts = REG_EXTENDED;
449
string_len = strlen(string);
451
err = my_regcomp(&re, pattern, copts, &my_charset_latin1);
456
/* allocate storage for (sub-)expression-matches */
457
subs = (my_regmatch_t *)calloc(sizeof(my_regmatch_t),re.re_nsub+1);
459
/* start with a buffer that is twice the size of the string
460
we're doing replacements in */
461
buf_len = 2 * string_len + 1;
462
buf = calloc(buf_len, sizeof(char));
467
err = my_regexec(&re, &string[pos], re.re_nsub+1, subs, (pos ? REG_NOTBOL : 0));
469
if (err && err != REG_NOMATCH) {
477
/* backref replacement is done in two passes:
478
1) find out how long the string will be, and allocate buf
479
2) copy the part before match, replacement and backrefs to buf
481
Jaakko Hyvätti <Jaakko.Hyvatti@iki.fi>
484
new_l = strlen(buf) + subs[0].rm_so; /* part before the match */
487
if ('\\' == *walk && isdigit((unsigned char)walk[1]) && ((unsigned char)walk[1]) - '0' <= re.re_nsub) {
488
if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1) {
489
new_l += subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so;
497
if (new_l + 1 > buf_len) {
498
buf_len = 1 + buf_len + 2 * new_l;
499
nbuf = malloc(buf_len);
505
/* copy the part of the string before the match */
506
strncat(buf, &string[pos], subs[0].rm_so);
508
/* copy replacement and backrefs */
509
walkbuf = &buf[tmp + subs[0].rm_so];
512
if ('\\' == *walk && isdigit(walk[1]) && walk[1] - '0' <= (int)re.re_nsub) {
513
if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1
514
/* this next case shouldn't happen. it does. */
515
&& subs[walk[1] - '0'].rm_so <= subs[walk[1] - '0'].rm_eo) {
517
tmp = subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so;
518
memcpy (walkbuf, &string[pos + subs[walk[1] - '0'].rm_so], tmp);
523
*walkbuf++ = *walk++;
528
/* and get ready to keep looking for replacements */
529
if (subs[0].rm_so == subs[0].rm_eo) {
530
if (subs[0].rm_so + pos >= string_len) {
533
new_l = strlen (buf) + 1;
534
if (new_l + 1 > buf_len) {
535
buf_len = 1 + buf_len + 2 * new_l;
536
nbuf = calloc(buf_len, sizeof(char));
541
pos += subs[0].rm_eo + 1;
542
buf [new_l-1] = string [pos-1];
545
pos += subs[0].rm_eo;
547
} else { /* REG_NOMATCH */
548
new_l = strlen(buf) + strlen(&string[pos]);
549
if (new_l + 1 > buf_len) {
550
buf_len = new_l + 1; /* now we know exactly how long it is */
551
nbuf = calloc(buf_len, sizeof(char));
556
/* stick that last bit of string on our output */
557
strcat(buf, &string[pos]);
561
/* don't want to leak memory .. */
569
static int parse_mode(const char *mode)
571
int flags = REG_EXTENDED | REG_NEWLINE;
576
case 'i': flags |= REG_ICASE; break; /* case insensitive */
577
case 'c': flags &= ~REG_ICASE; break; /* case sensitive */
578
case 'n': break; /* . matches newline */
579
case 'm': break; /* multiple lines */
580
case 'x': break; /* ignore whitespace */
583
} while (*mode != '\0');
586
fprintf(stderr, "flags are %X\n", flags);