6
6
created at: Mon Aug 9 18:24:49 JST 1993
8
Copyright (C) 1993-2006 Yukihiro Matsumoto
8
Copyright (C) 1993-2007 Yukihiro Matsumoto
10
10
**********************************************************************/
12
#include "ruby/ruby.h"
14
#include "ruby/encoding.h"
14
15
#include "regint.h"
201
option_to_str(char str[4], int options)
204
if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
205
if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
206
if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
212
arg_kcode(int options)
214
switch (options & ARG_KCODE_MASK) {
215
case ARG_KCODE_NONE: return "n";
216
case ARG_KCODE_EUC: return "e";
217
case ARG_KCODE_SJIS: return "s";
218
case ARG_KCODE_UTF8: return "u";
227
case KCODE_NONE: return "n";
228
case KCODE_EUC: return "e";
229
case KCODE_SJIS: return "s";
230
case KCODE_UTF8: return "u";
198
236
rb_char_to_option_kcode(int c, int *option, int *kcode)
253
291
set_re_kcode_by_option(struct RRegexp *re, int options)
293
rb_encoding *enc = 0;
295
FL_UNSET(re, KCODE_MASK);
255
296
switch (options & ARG_KCODE_MASK) {
256
297
case ARG_KCODE_NONE:
257
FL_UNSET(re, KCODE_MASK);
298
enc = rb_enc_from_index(0);
299
FL_SET(re, KCODE_NONE);
258
300
FL_SET(re, KCODE_FIXED);
260
302
case ARG_KCODE_EUC:
261
FL_UNSET(re, KCODE_MASK);
303
enc = rb_enc_find("euc-jp");
262
304
FL_SET(re, KCODE_EUC);
263
305
FL_SET(re, KCODE_FIXED);
265
307
case ARG_KCODE_SJIS:
266
FL_UNSET(re, KCODE_MASK);
308
enc = rb_enc_find("sjis");
309
FL_SET(re, KCODE_FIXED);
267
310
FL_SET(re, KCODE_SJIS);
268
FL_SET(re, KCODE_FIXED);
270
312
case ARG_KCODE_UTF8:
271
FL_UNSET(re, KCODE_MASK);
313
enc = rb_enc_find("utf-8");
272
314
FL_SET(re, KCODE_UTF8);
273
315
FL_SET(re, KCODE_FIXED);
357
388
rb_reg_expr_str(VALUE str, const char *s, long len)
390
rb_encoding *enc = rb_enc_get(str);
359
391
const char *p, *pend;
360
392
int need_escape = 0;
362
394
p = s; pend = p + len;
364
if (*p == '/' || (!ISPRINT(*p) && !ismbchar(*p))) {
396
if (*p == '/' || (!rb_enc_isprint(*p, enc) && !ismbchar(p, enc))) {
370
402
if (!need_escape) {
371
403
rb_str_buf_cat(str, s, len);
384
416
rb_str_buf_cat(str, &c, 1);
385
417
rb_str_buf_cat(str, p, 1);
387
else if (ismbchar(*p)) {
388
rb_str_buf_cat(str, p, mbclen(*p));
419
else if (ismbchar(p, enc)) {
420
rb_str_buf_cat(str, p, mbclen(p, enc));
392
else if (ISPRINT(*p)) {
424
else if (rb_enc_isprint(*p, enc)) {
393
425
rb_str_buf_cat(str, p, 1);
395
else if (!ISSPACE(*p)) {
427
else if (!rb_enc_isspace(*p, enc)) {
398
430
sprintf(b, "\\%03o", *p & 0377);
414
446
rb_reg_expr_str(str, s, len);
415
447
rb_str_buf_cat2(str, "/");
417
450
rb_reg_check(re);
418
if (RREGEXP(re)->ptr->options & ONIG_OPTION_MULTILINE)
419
rb_str_buf_cat2(str, "m");
420
if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE)
421
rb_str_buf_cat2(str, "i");
422
if (RREGEXP(re)->ptr->options & ONIG_OPTION_EXTEND)
423
rb_str_buf_cat2(str, "x");
451
if (*option_to_str(opts, RREGEXP(re)->ptr->options))
452
rb_str_buf_cat2(str, opts);
425
454
if (FL_TEST(re, KCODE_FIXED)) {
426
switch ((RBASIC(re)->flags & KCODE_MASK)) {
428
rb_str_buf_cat2(str, "n");
431
rb_str_buf_cat2(str, "e");
434
rb_str_buf_cat2(str, "s");
437
rb_str_buf_cat2(str, "u");
455
rb_str_buf_cat2(str, opt_kcode(RBASIC(re)->flags & KCODE_MASK));
442
458
OBJ_INFECT(str, re);
578
if (options & ONIG_OPTION_MULTILINE) rb_str_buf_cat2(str, "m");
579
if (options & ONIG_OPTION_IGNORECASE) rb_str_buf_cat2(str, "i");
580
if (options & ONIG_OPTION_EXTEND) rb_str_buf_cat2(str, "x");
595
if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
582
597
if ((options & embeddable) != embeddable) {
583
rb_str_buf_cat2(str, "-");
584
if (!(options & ONIG_OPTION_MULTILINE)) rb_str_buf_cat2(str, "m");
585
if (!(options & ONIG_OPTION_IGNORECASE)) rb_str_buf_cat2(str, "i");
586
if (!(options & ONIG_OPTION_EXTEND)) rb_str_buf_cat2(str, "x");
599
option_to_str(optbuf + 1, ~options);
600
rb_str_buf_cat2(str, optbuf);
589
603
rb_str_buf_cat2(str, ":");
598
rb_reg_raise(const char *s, long len, const char *err, VALUE re, int ce)
612
rb_reg_raise(const char *s, long len, const char *err, VALUE re)
600
614
VALUE desc = rb_reg_desc(s, len, re);
603
rb_compile_error("%s: %s", err, RSTRING_PTR(desc));
605
rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
616
rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
620
rb_reg_error_desc(VALUE str, int options, const char *err)
623
VALUE desc = rb_str_buf_new2(err);
625
rb_str_buf_cat2(desc, ": /");
626
rb_reg_expr_str(desc, RSTRING_PTR(str), RSTRING_LEN(str));
628
option_to_str(opts + 1, options);
629
strlcat(opts, arg_kcode(options), sizeof(opts));
630
rb_str_buf_cat2(desc, opts);
631
return rb_exc_new3(rb_eRegexpError, desc);
635
rb_reg_raise_str(VALUE str, int options, const char *err)
637
rb_exc_raise(rb_reg_error_desc(str, options, err));
688
make_regexp(const char *s, long len, int flags, int ce)
720
make_regexp(const char *s, long len, int flags, onig_errmsg_buffer err)
691
char err[ONIG_MAX_ERROR_MESSAGE_LEN];
693
724
OnigErrorInfo einfo;
705
736
OnigDefaultSyntax);
707
738
onig_error_code_to_str((UChar*)err, r);
708
rb_reg_raise(s, len, err, 0, ce);
711
742
r = onig_compile(rp, (UChar*)s, (UChar*)(s + len), &einfo);
715
746
(void )onig_error_code_to_str((UChar*)err, r, &einfo);
716
rb_reg_raise(s, len, err, 0, ce);
924
954
reg->options, onigenc_get_default_encoding(),
925
955
OnigDefaultSyntax, &einfo);
927
onig_error_code_to_str((UChar*)err, r, &einfo);
928
rb_reg_raise((char* )pattern, RREGEXP(re)->len, err, re, Qfalse);
957
onig_error_code_to_str((UChar*)err, r, &einfo);
958
rb_reg_raise((char* )pattern, RREGEXP(re)->len, err, re);
931
961
RREGEXP(re)->ptr = reg2;
1019
char err[ONIG_MAX_ERROR_MESSAGE_LEN];
1049
onig_errmsg_buffer err;
1020
1050
onig_error_code_to_str((UChar*)err, result);
1021
rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, 0, Qfalse);
1051
rb_reg_raise(RREGEXP(re)->str, RREGEXP(re)->len, err, 0);
1076
1106
if (start == -1) return Qnil;
1077
1107
end = RMATCH(match)->END(nth);
1078
1108
len = end - start;
1079
str = rb_str_substr(RMATCH(match)->str, start, len);
1109
str = rb_str_subseq(RMATCH(match)->str, start, len);
1080
1110
OBJ_INFECT(str, match);
1107
1137
if (NIL_P(match)) return Qnil;
1108
1138
if (RMATCH(match)->BEG(0) == -1) return Qnil;
1109
str = rb_str_substr(RMATCH(match)->str, 0, RMATCH(match)->BEG(0));
1139
str = rb_str_subseq(RMATCH(match)->str, 0, RMATCH(match)->BEG(0));
1110
1140
if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1133
1163
if (RMATCH(match)->BEG(0) == -1) return Qnil;
1134
1164
str = RMATCH(match)->str;
1135
1165
pos = RMATCH(match)->END(0);
1136
str = rb_str_substr(str, pos, RSTRING_LEN(str) - pos);
1166
str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
1137
1167
if (OBJ_TAINTED(match)) OBJ_TAINT(str);
1190
1220
rb_ary_push(ary, Qnil);
1193
VALUE str = rb_str_substr(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1223
VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1194
1224
if (taint) OBJ_TAINT(str);
1195
1225
rb_ary_push(ary, str);
1385
1415
int taint = OBJ_TAINTED(match);
1387
1417
for (i=0; i<regs->num_regs; i++) {
1388
VALUE str = rb_str_substr(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1418
VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
1389
1419
if (taint) OBJ_TAINT(str);
1390
1420
if (RTEST(rb_yield(str))) {
1391
1421
rb_ary_push(result, str);
1434
1464
return RMATCH(match)->str; /* str is frozen */
1468
match_inspect(VALUE match)
1470
char *cname = rb_obj_classname(match);
1474
str = rb_str_buf_new2("#<");
1475
rb_str_buf_cat2(str, cname);
1477
for (i = 0; i < RMATCH(match)->regs->num_regs; i++) {
1479
rb_str_buf_cat2(str, " ");
1480
v = rb_reg_nth_match(i, match);
1482
rb_str_buf_cat2(str, "nil");
1484
rb_str_buf_append(str, rb_str_inspect(v));
1486
rb_str_buf_cat2(str, ">");
1437
1491
VALUE rb_cRegexp;
1440
rb_reg_initialize(VALUE obj, const char *s, long len,
1442
int ce) /* call rb_compile_error() */
1494
rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
1495
int options, onig_errmsg_buffer err)
1444
1497
struct RRegexp *re = RREGEXP(obj);
1456
set_re_kcode_by_option(re, options);
1509
if (options & ARG_KCODE_MASK) {
1510
set_re_kcode_by_option(re, options);
1513
rb_enc_associate((VALUE)re, enc);
1458
1516
if (options & ARG_KCODE_MASK) {
1459
1517
kcode_set_option((VALUE)re);
1462
1520
options |= ONIG_OPTION_IGNORECASE;
1463
1521
FL_SET(re, REG_CASESTATE);
1465
re->ptr = make_regexp(s, len, options & ARG_REG_OPTION_MASK, ce);
1523
re->ptr = make_regexp(s, len, options & ARG_REG_OPTION_MASK, err);
1524
if (!re->ptr) return -1;
1466
1525
re->str = ALLOC_N(char, len+1);
1467
1526
memcpy(re->str, s, len);
1468
1527
re->str[len] = '\0';
1470
1529
if (options & ARG_KCODE_MASK) {
1471
1530
kcode_reset_option();
1473
if (ce) FL_SET(obj, REG_LITERAL);
1536
rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
1538
return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str),
1490
rb_reg_new(const char *s, long len, int options)
1556
rb_reg_new(VALUE s, int options)
1492
1558
VALUE re = rb_reg_s_alloc(rb_cRegexp);
1494
rb_reg_initialize(re, s, len, options, Qfalse);
1559
onig_errmsg_buffer err;
1561
if (rb_reg_initialize_str(re, s, options, err) != 0) {
1562
rb_reg_raise_str(s, options, err);
1499
rb_reg_compile(const char *s, long len, int options)
1569
rb_reg_compile(VALUE str, int options)
1501
1571
VALUE re = rb_reg_s_alloc(rb_cRegexp);
1572
onig_errmsg_buffer err;
1503
rb_reg_initialize(re, s, len, options, Qtrue);
1574
if (!str) str = rb_str_new(0,0);
1575
if (rb_reg_initialize_str(re, str, options, err) != 0) {
1576
rb_set_errinfo(rb_reg_error_desc(str, options, err));
1579
FL_SET(re, REG_LITERAL);
1507
1583
static int case_cache;
1521
1597
case_cache = ruby_ignorecase;
1522
1598
kcode_cache = reg_kcode;
1523
return reg_cache = rb_reg_new(RSTRING_PTR(save_str), RSTRING_LEN(save_str),
1599
return reg_cache = rb_reg_new(save_str, ruby_ignorecase);
1782
1857
rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
1859
onig_errmsg_buffer err;
1788
1863
if (argc == 0 || argc > 3) {
1789
1864
rb_raise(rb_eArgError, "wrong number of arguments");
1797
1872
if (FL_TEST(argv[0], KCODE_FIXED)) {
1798
1873
flags |= re_to_kcode_arg_value(argv[0]);
1800
s = RREGEXP(argv[0])->str;
1801
len = RREGEXP(argv[0])->len;
1875
str = rb_enc_str_new(RREGEXP(argv[0])->str, RREGEXP(argv[0])->len,
1876
rb_enc_get(argv[0]));
1804
1879
if (argc >= 2) {
1811
1886
flags &= ~ARG_KCODE_MASK;
1812
1887
flags |= char_to_arg_kcode((int )kcode[0]);
1814
s = StringValuePtr(argv[0]);
1815
len = RSTRING_LEN(argv[0]);
1817
rb_reg_initialize(self, s, len, flags, Qfalse);
1891
if (rb_reg_initialize_str(self, str, flags, err) != 0) {
1892
rb_reg_raise_str(str, flags, err);
1822
1898
rb_reg_quote(VALUE str)
1900
rb_encoding *enc = rb_enc_get(str);
1824
1901
char *s, *send, *t;
1964
2041
return options;
2045
rb_check_regexp_type(VALUE re)
2047
return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
2052
* Regexp.try_convert(obj) -> re or nil
2054
* Try to convert <i>obj</i> into a Regexp, using to_regexp method.
2055
* Returns converted regexp or nil if <i>obj</i> cannot be converted
2058
* Regexp.try_convert(/re/) # => /re/
2059
* Regexp.try_convert("re") # => nil
2062
rb_reg_s_try_convert(VALUE dummy, VALUE re)
2064
return rb_check_regexp_type(re);
2007
2106
volatile VALUE v;
2009
2108
rb_str_buf_cat2(source, "|");
2010
v = rb_check_convert_type(argv[i], T_REGEXP, "Regexp", "to_regexp");
2109
v = rb_check_regexp_type(argv[i]);
2011
2110
if (!NIL_P(v)) {
2012
2111
if (FL_TEST(v, KCODE_FIXED)) {
2013
2112
if (kcode == -1) {
2033
2132
args[0] = source;
2034
2133
args[1] = Qnil;
2037
2135
args[2] = Qnil;
2040
args[2] = rb_str_new2("n");
2043
args[2] = rb_str_new2("e");
2046
args[2] = rb_str_new2("s");
2049
args[2] = rb_str_new2("u");
2138
args[2] = rb_str_new2(opt_kcode(kcode));
2052
2140
return rb_class_new_instance(3, args, rb_cRegexp);
2064
2156
rb_raise(rb_eTypeError, "wrong argument type");
2066
2158
rb_reg_check(re);
2067
rb_reg_initialize(copy, RREGEXP(re)->str, RREGEXP(re)->len,
2068
rb_reg_options(re), Qfalse);
2159
s = RREGEXP(re)->str;
2160
len = RREGEXP(re)->len;
2161
if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), err) != 0) {
2162
rb_reg_raise(s, len, err, re);
2076
2171
char *p, *s, *e;
2077
2172
unsigned char uc;
2174
rb_encoding *enc = rb_enc_check(str, src);
2176
rb_enc_check(str, regexp);
2081
2177
p = s = RSTRING_PTR(str);
2082
2178
e = s + RSTRING_LEN(str);
2084
2180
while (s < e) {
2087
uc = (unsigned char)*s++;
2089
s += mbclen(uc) - 1;
2183
if (ismbchar(ss, enc)) {
2184
s += mbclen(ss, enc) - 1;
2092
if (uc != '\\' || s == e) continue;
2187
if (*ss != '\\' || s == e) continue;
2095
2190
val = rb_str_buf_new(ss-p);
2119
2214
name_end = name = s + 1;
2120
2215
while (name_end < e) {
2121
2216
if (*name_end == '>') break;
2122
uc = (unsigned char)*name_end;
2123
name_end += mbclen(uc);
2217
name_end += mbclen(name_end, enc);
2125
2219
if (name_end < e) {
2126
2220
no = name_to_backref_number(regs, regexp, name, name_end);
2289
2383
* <code>MatchData</code> object.
2291
2385
* /c(.)t/ =~ 'cat' #=> 0
2292
* Regexp.last_match #=> #<MatchData:0x401b3d30>
2386
* Regexp.last_match #=> #<MatchData "cat" "a">
2293
2387
* Regexp.last_match(0) #=> "cat"
2294
2388
* Regexp.last_match(1) #=> "a"
2295
2389
* Regexp.last_match(2) #=> nil
2354
2448
rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, -1);
2355
2449
rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union, -1);
2356
2450
rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
2451
rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
2358
2453
rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
2359
2454
rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
2396
2491
rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
2397
2492
rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
2398
2493
rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
2399
rb_define_method(rb_cMatch, "inspect", rb_any_to_s, 0); /* in object.c */
2494
rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
2400
2495
rb_define_method(rb_cMatch, "string", match_string, 0);