81
85
typedef std::list<regexp_elem>::const_iterator const_iterator;
87
#define MAXLOOKBEHIND 10
88
static bool lookbehind_warned = false;
84
91
octregexp_list (const octave_value_list &args, const std::string &nm,
85
92
bool case_insensitive, std::list<regexp_elem> &lst,
86
string_vector &named, int &nopts)
93
string_vector &named, int &nopts, bool &once)
89
96
#if defined (HAVE_REGEX) || defined (HAVE_PCRE)
90
97
int nargin = args.length();
92
98
bool lineanchors = false;
93
99
bool dotexceptnewline = false;
94
100
bool freespacing = false;
96
102
nopts = nargin - 2;
98
105
std::string buffer = args(0).string_value ();
106
size_t max_length = (buffer.length () > MAXLOOKBEHIND ?
107
MAXLOOKBEHIND: buffer.length ());
101
111
gripe_wrong_type_arg (nm.c_str(), args(0));
204
208
std::ostringstream buf;
205
209
Array<int> named_idx;
207
while ((new_pos = pattern.find ("(?<",pos)) != NPOS)
211
while ((new_pos = pattern.find ("(?",pos)) != NPOS)
209
size_t tmp_pos = pattern.find_first_of ('>',new_pos);
213
error ("syntax error in pattern");
217
std::string tmp_name = pattern.substr(new_pos+3,tmp_pos-new_pos-3);
220
for (int i = 0; i < nnames; i++)
221
if (named(i) == tmp_name)
223
named_idx.resize(inames+1);
224
named_idx(inames) = i;
230
named_idx.resize(inames+1);
231
named_idx(inames) = nnames;
232
named.append(tmp_name);
236
if (new_pos - pos > 0)
237
buf << pattern.substr(pos,new_pos-pos);
239
buf << "(?P<n00" << inames++;
240
else if (inames < 100)
241
buf << "(?P<n0" << inames++;
213
if (pattern.at (new_pos + 2) == '<' &&
214
!(pattern.at (new_pos + 3) == '=' ||
215
pattern.at (new_pos + 3) == '!'))
217
// The syntax of named tokens in pcre is "(?P<name>...)" while
218
// we need a syntax "(?<name>...)", so fix that here. Also an
220
// "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)"
221
// should be perfectly legal, while pcre does not allow the same
222
// named token name on both sides of the alternative. Also fix
223
// that here by replacing named tokens by dummy names, and dealing
224
// with the dummy names later.
226
size_t tmp_pos = pattern.find_first_of ('>',new_pos);
230
error ("syntax error in pattern");
234
std::string tmp_name =
235
pattern.substr(new_pos+3,tmp_pos-new_pos-3);
238
for (int i = 0; i < nnames; i++)
239
if (named(i) == tmp_name)
241
named_idx.resize(inames+1);
242
named_idx(inames) = i;
248
named_idx.resize(inames+1);
249
named_idx(inames) = nnames;
250
named.append(tmp_name);
254
if (new_pos - pos > 0)
255
buf << pattern.substr(pos,new_pos-pos);
257
buf << "(?P<n00" << inames++;
258
else if (inames < 100)
259
buf << "(?P<n0" << inames++;
261
buf << "(?P<n" << inames++;
264
else if (pattern.at (new_pos + 2) == '<')
266
// Find lookbehind operators of arbitrary length (i.e., like
267
// "(?<=[a-z]*)") and replace with a maximum length operator
268
// as PCRE can not yet handle arbitrary length lookbehind
269
// operators. Use the string length as the maximum length to
273
size_t tmp_pos1 = new_pos + 2;
274
size_t tmp_pos2 = tmp_pos1;
275
while (tmp_pos1 <= pattern.length () && brackets > 0)
277
char ch = pattern.at (tmp_pos1);
292
buf << pattern.substr (pos, new_pos - pos) << "(?";
297
size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2);
298
if (tmp_pos3 != NPOS && tmp_pos3 < tmp_pos1)
300
if (!lookbehind_warned)
302
lookbehind_warned = true;
303
warning ("%s: arbitrary length lookbehind patterns are only support up to length %d", nm.c_str(), MAXLOOKBEHIND);
306
buf << pattern.substr (pos, new_pos - pos) << "(";
309
if (pattern.at (tmp_pos3) == '*')
314
for (; i < max_length + 1; i++)
316
buf <<pattern.substr(new_pos, tmp_pos3 - new_pos)
318
buf << pattern.substr(tmp_pos3 + 1,
319
tmp_pos1 - tmp_pos3 - 1);
326
buf << pattern.substr (pos, tmp_pos1 - pos);
243
buf << "(?P<n" << inames++;
332
buf << pattern.substr (pos, new_pos - pos) << "(?";
247
338
buf << pattern.substr(pos);
298
389
(idx ? PCRE_NOTBOL : 0),
299
390
ovector, (subpatterns+1)*3);
392
if (matches == PCRE_ERROR_MATCHLIMIT)
394
// try harder; start with default value for MATCH_LIMIT and increase it
395
warning("Your pattern caused PCRE to hit its MATCH_LIMIT.\nTrying harder now, but this will be slow.");
397
pcre_config(PCRE_CONFIG_MATCH_LIMIT, static_cast <void *> (&pe.match_limit));
398
pe.flags = PCRE_EXTRA_MATCH_LIMIT;
401
while (matches == PCRE_ERROR_MATCHLIMIT &&
402
i++ < PCRE_MATCHLIMIT_MAX)
406
pe.match_limit *= 10;
407
matches = pcre_exec(re, &pe, buffer.c_str(),
408
buffer.length(), idx,
409
(idx ? PCRE_NOTBOL : 0),
410
ovector, (subpatterns+1)*3);
301
414
if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
303
error ("%s: internal error calling pcre_exec", nm.c_str());
416
error ("%s: internal error calling pcre_exec\nError code from pcre_exec is %i", nm.c_str(), matches);
482
596
retval(5) = Octave_map();
485
Cell t (dim_vector(1, sz));
487
for (const_iterator p = lst.begin(); p != lst.end(); p++)
491
Cell m (dim_vector(1, sz));
493
for (const_iterator p = lst.begin(); p != lst.end(); p++)
498
Cell te (dim_vector(1, sz));
500
for (const_iterator p = lst.begin(); p != lst.end(); p++)
504
NDArray e (dim_vector(1, sz));
506
for (const_iterator p = lst.begin(); p != lst.end(); p++)
600
retval(4) = sz ? lst.front ().t : Cell();
603
Cell t (dim_vector(1, sz));
605
for (const_iterator p = lst.begin(); p != lst.end(); p++)
611
retval(3) = sz ? lst.front ().m : std::string();
614
Cell m (dim_vector(1, sz));
616
for (const_iterator p = lst.begin(); p != lst.end(); p++)
622
retval(2) = sz ? lst.front ().te : Matrix();
625
Cell te (dim_vector(1, sz));
627
for (const_iterator p = lst.begin(); p != lst.end(); p++)
635
retval(1) = lst.front ().e;
637
retval(1) = Matrix();
641
NDArray e (dim_vector(1, sz));
643
for (const_iterator p = lst.begin(); p != lst.end(); p++)
651
retval(0) = lst.front ().s;
653
retval(0) = Matrix();
510
657
NDArray s (dim_vector(1, sz));
512
659
for (const_iterator p = lst.begin(); p != lst.end(); p++)
516
664
// Alter the order of the output arguments
911
1070
%! [s, e, te, m, t] = regexp('short test string','\w*r\w*','once');
914
%! assert (size(te), [1,1])
915
%! assert (isempty(te{1}))
916
%! assert (m{1},'short')
917
%! ## Matlab gives [1,0] here but that seems wrong.
918
%! assert (size(t), [1,1])
1073
%! assert (isempty(te))
1074
%! assert (m,'short')
1075
%! assert (isempty(t))
921
1078
%! [m, te, e, s, t] = regexp('short test string','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
924
%! assert (size(te), [1,1])
925
%! assert (isempty(te{1}))
926
%! assert (m{1},'short')
927
%! ## Matlab gives [1,0] here but that seems wrong.
928
%! assert (size(t), [1,1])
1081
%! assert (isempty(te))
1082
%! assert (m,'short')
1083
%! assert (isempty(t))
930
1085
%!testif HAVE_PCRE
931
1086
%! ## This test is expected to fail if PCRE is not installed
1011
1166
%!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]})
1012
1167
%!assert(regexp('Strings',{'t','s'}),{2,7})
1169
## Test case for lookaround operators
1170
%!assert(regexp('Iraq','q(?!u)'),4)
1171
%!assert(regexp('quit','q(?!u)'), zeros(1,0))
1172
%!assert(regexp('quit','q(?=u)','match'), {'q'})
1173
%!assert(regexp("quit",'q(?=u+)','match'), {'q'})
1174
%!assert(regexp("qit",'q(?=u+)','match'), cell(1,0))
1175
%!assert(regexp("qit",'q(?=u*)','match'), {'q'})
1177
%!assert(regexp('thingamabob','(?<=a)b'), 9)
1016
1181
DEFUN_DLD (regexpi, args, nargout,
1087
1252
%! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*','once');
1088
1253
%! assert (s,1)
1089
1254
%! assert (e,5)
1090
%! assert (size(te), [1,1])
1091
%! assert (isempty(te{1}))
1092
%! assert (m{1},'ShoRt')
1093
%! ## Matlab gives [1,0] here but that seems wrong.
1094
%! assert (size(t), [1,1])
1255
%! assert (isempty(te))
1256
%! assert (m,'ShoRt')
1257
%! assert (isempty(t))
1097
1260
%! [m, te, e, s, t] = regexpi('ShoRt Test String','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
1098
1261
%! assert (s,1)
1099
1262
%! assert (e,5)
1100
%! assert (size(te), [1,1])
1101
%! assert (isempty(te{1}))
1102
%! assert (m{1},'ShoRt')
1103
%! ## Matlab gives [1,0] here but that seems wrong.
1104
%! assert (size(t), [1,1])
1263
%! assert (isempty(te))
1264
%! assert (m,'ShoRt')
1265
%! assert (isempty(t))
1106
1267
%!testif HAVE_PCRE
1107
1268
%! ## This test is expected to fail if PCRE is not installed
1540
1703
%!assert(regexprep({"abc","cba"},"b","?"),{"a?c","c?a"})
1541
1704
%!assert(regexprep({"abc","cba"},{"b","a"},{"?","!"}),{"!?c","c?!"})
1706
# Nasty lookbehind expression
1707
%!assert(regexprep('x^(-1)+y(-1)+z(-1)=0','(?<=[a-z]+)\(\-[1-9]*\)','_minus1'),'x^(-1)+y_minus1+z_minus1=0')