~vcs-imports/mysql-udf-regexp/trunk

« back to all changes in this revision

Viewing changes to regexp.xml

Committer: hartmut
Date: 2006-09-27 03:11:49 UTC
Revision ID: vcs-imports@canonical.com-20060927031149-nitx03y6ytqcocyj

initial

Show diffs side-by-side

added added

removed removed

regexp.xml

<?xml version="1.0" encoding="ISO-8859-1" ?>

<!DOCTYPE udf SYSTEM "udf_extension.dtd">

<name>Hartmut Holzgraefe</name>

<email>hartmut@mysql.com</email>

</maintainer>

<notes>

First working release

</notes>

</release>

<deps>

<src/>

</deps>

<data>

</data>

<init>

<?data

if (pattern) {
    // static regex pattern -> we can compile it once here and reuse it
    int stat;
    char *copy;

    // my_regcomp needs a NUL terminated C string, so compile from a
    // length-bounded copy of the argument
    copy = strndup(pattern, pattern_len);
    if (!copy) {
        // strndup failed: report it instead of handing NULL to my_regcomp
        strcpy(message, "regcomp failed (out of memory)");
        return 1;
    }

    stat = my_regcomp(&data->expr, copy, parse_mode(mode), &my_charset_latin1);
    free(copy);

    if (stat) {
        sprintf(message, "regcomp failed (error: %d)", stat);
        return 1;
    }

    // tells the code and deinit sections that data->expr is already compiled
    data->dynamic = 0;
} else {
    // no pattern available yet: the code section compiles it on each call
    data->dynamic = 1;
}

</init>

<?data

// Release the expression that the init section compiled once
// (data->dynamic == 0). In dynamic mode the code section frees its own
// per-call expression, so there is nothing to release here.
if (!data->dynamic) {

// free static compiled pattern

my_regfree(&data->expr);

}

</deinit>

<code>

<?data

my_regmatch_t match;

int stat;

char *copy;

if (data->dynamic) {

copy = strndup(pattern, pattern_len);

stat = my_regcomp(&data->expr, copy, parse_mode(mode), &my_charset_latin1);

free(copy);

if (stat) {

// TODO: need ERROR() and WARNING() macro

RETURN_NULL;

}

copy = strndup(text, text_len);

stat = my_regexec(&data->expr, copy, 1, &match, 0);

free(copy);

if (data->dynamic) {

my_regfree(&data->expr);

}

if (stat && (stat != REG_NOMATCH)) {

fprintf(stderr, "regexec error %d '%s' '%s'\n", stat, pattern, text);

RETURN_NULL;

}

RETURN_INT(stat == REG_NOMATCH ? 0 : 1);

100

101

102

</code>

103

104

<test>

105

<code>

106

# testing simple cases

107

SELECT REGEXP_LIKE("xxxabcxxx", ".*abc.*") AS r1;

108

SELECT REGEXP_LIKE("xxxabdxxx", ".*abc.*") AS r2;

109

</code>

110

111

112

113

114

115

</result>

116

117

<code>

118

# testing case sensitivity

119

SELECT REGEXP_LIKE("xxxABCxxx", ".*abc.*") AS r3;

120

SELECT REGEXP_LIKE("xxxABCxxx", ".*abc.*", "i") AS r4;

121

</code>

122

123

124

125

126

127

</result>

128

129

<code>

130

# testing POSIX character classes

131

SELECT REGEXP_LIKE("abcdef", "^[[:alpha:]]+$") AS r1;

132

SELECT REGEXP_LIKE("123456", "^[[:alpha:]]+$") AS r2;

133

SELECT REGEXP_LIKE("123abcdef", "^[[:xdigit:]]+$") AS r3;

134

</code>

135

136

137

138

139

140

141

142

</result>

143

144

</test>

145

146

</function>

147

148

149

150

151

152

153

<data>

154

155

156

</data>

157

158

<init>

159

<?data

160

if (pattern) {
    // static regex pattern -> we can compile it once here and reuse it
    int stat;
    char *copy;

    // my_regcomp needs a NUL terminated C string, so compile from a
    // length-bounded copy of the argument
    copy = strndup(pattern, pattern_len);
    if (!copy) {
        // strndup failed: report it instead of handing NULL to my_regcomp
        strcpy(message, "regcomp failed (out of memory)");
        return 1;
    }

    stat = my_regcomp(&data->expr, copy, REG_EXTENDED, &my_charset_latin1);
    free(copy);

    if (stat) {
        sprintf(message, "regcomp failed (error: %d)", stat);
        return 1;
    }

    // tells the code and deinit sections that data->expr is already compiled
    data->dynamic = 0;
} else {
    // no pattern available yet: the code section compiles it on each call
    data->dynamic = 1;
}

180

181

</init>

182

183

184

<?data

185

// Release the expression that the init section compiled once
// (data->dynamic == 0). In dynamic mode the code section frees its own
// per-call expression, so there is nothing to release here.
if (!data->dynamic) {

186

// free static compiled pattern

187

my_regfree(&data->expr);

188

}

189

190

</deinit>

191

192

<code>

193

<?data

194

my_regmatch_t match;

195

int stat;

196

char *copy;

197

198

if (data->dynamic) {

199

copy = strndup(pattern, pattern_len);

200

stat = my_regcomp(&data->expr, copy, REG_EXTENDED, &my_charset_latin1);

201

free(copy);

202

if (stat) {

203

// TODO: need ERROR() and WARNING() macro

204

RETURN_NULL;

205

}

206

}

207

208

copy = strndup(text, text_len);

209

stat = my_regexec(&data->expr, copy, 1, &match, 0);

210

free(copy);

211

212

if (data->dynamic) {

213

my_regfree(&data->expr);

214

}

215

216

if (stat) {

217

if (stat != REG_NOMATCH) {

218

fprintf(stderr, "regexec error %d '%s' '%s'\n", stat, pattern, text);

219

}

220

RETURN_NULL;

221

}

222

223

RETURN_STRINGL(text + match.rm_so, match.rm_eo - match.rm_so);

224

225

226

</code>

227

<test>

228

<code>

229

SELECT REGEXP_SUBSTR("abc 123 def", "[[:digit:]]+") AS r1;

230

</code>

231

232

233

123

234

</result>

235

</test>

236

</function>

237

238

239

240

241

242

243

244

245

246

247

248

<data>

249

250

251

</data>

252

253

<init>

254

<?data

255

if (pattern) {
    // static regex pattern -> we can compile it once here and reuse it
    int stat;
    char *copy;

    // my_regcomp needs a NUL terminated C string, so compile from a
    // length-bounded copy of the argument
    copy = strndup(pattern, pattern_len);
    if (!copy) {
        // strndup failed: report it instead of handing NULL to my_regcomp
        strcpy(message, "regcomp failed (out of memory)");
        return 1;
    }

    stat = my_regcomp(&data->expr, copy, parse_mode(mode), &my_charset_latin1);
    free(copy);

    if (stat) {
        sprintf(message, "regcomp failed (error: %d)", stat);
        return 1;
    }

    // tells the code and deinit sections that data->expr is already compiled
    data->dynamic = 0;
} else {
    // no pattern available yet: the code section compiles it on each call
    data->dynamic = 1;
}

275

276

</init>

277

278

279

<?data

280

// Release the expression that the init section compiled once
// (data->dynamic == 0). In dynamic mode the code section frees its own
// per-call expression, so there is nothing to release here.
if (!data->dynamic) {

281

// free static compiled pattern

282

my_regfree(&data->expr);

283

}

284

285

</deinit>

286

287

<code>

288

<?data

289

my_regmatch_t match;

290

int stat;

291

char *copy;

292

293

if (position) {

294

position -= 1; /* oracle offsets start at 1, not 0 */

295

}

296

297

if (data->dynamic) {

298

copy = strndup(pattern, pattern_len);

299

stat = my_regcomp(&data->expr, copy, parse_mode(mode), &my_charset_latin1);

300

free(copy);

301

if (stat) {

302

// TODO: need ERROR() and WARNING() macro

303

RETURN_NULL;

304

}

305

}

306

307

copy = strndup(text, text_len);

308

match.rm_eo = 0;

309

do {

310

position += match.rm_eo;

311

stat = my_regexec(&data->expr, copy + (size_t)position, 1, &match, 0);

312

} while ((stat == 0) && --occurrence > 0);

313

314

free(copy);

315

316

if (data->dynamic) {

317

my_regfree(&data->expr);

318

}

319

320

if (stat) {

321

fprintf(stderr, "regexec error %d '%s' '%s'\n", stat, pattern, text);

322

RETURN_NULL;

323

}

324

325

RETURN_INT(position + (return_end ? match.rm_eo : match.rm_so + 1));

326

327

</code>

328

<test>

329

<code>

330

SELECT REGEXP_INSTR("the quick brown fox jumps ...", "fox") AS r1;

331

</code>

332

333

334

335

</result>

336

<code>

337

SELECT REGEXP_INSTR("lala abc lala abc lala", "abc") AS r1;

338

SELECT REGEXP_INSTR("lala abc lala abc lala", "abc", 6) AS r2;

339

SELECT REGEXP_INSTR("lala abc lala abc lala", "abc", 7) AS r3;

340

</code>

341

342

343

344

345

346

347

348

</result>

349

<code>

350

SELECT REGEXP_INSTR("lala abc lala abc lala abc lala", "abc", 1, 1) AS r1;

351

SELECT REGEXP_INSTR("lala abc lala abc lala abc lala", "abc", 1, 2) AS r2;

352

SELECT REGEXP_INSTR("lala abc lala abc lala abc lala", "abc", 1, 3) AS r3;

353

SELECT REGEXP_INSTR("lala abc lala abc lala abc lala", "abc", 1, 4) AS r4;

354

</code>

355

356

357

358

359

360

361

362

363

NULL

364

</result>

365

<code>

366

# get character position of match start

367

SELECT REGEXP_INSTR("the quick brown fox jumps ...", "fox", 1, 1, 0) AS r1;

368

# get character position of match end

369

SELECT REGEXP_INSTR("the quick brown fox jumps ...", "fox", 1, 1, 1) AS r2;

370

# get character position of match end, use defaults for unused parameters

371

SELECT REGEXP_INSTR("the quick brown fox jumps ...", "fox", NULL, NULL, 1) AS r3;

372

</code>

373

374

375

376

377

378

379

380

</result>

381

</test>

382

</function>

383

384

385

386

387

388

389

390

<code>

391

char *c_pattern, *c_replace, *c_text;

392

char *result;

393

394

c_pattern = strndup(pattern, pattern_len);

395

c_replace = strndup(replace, replace_len);

396

c_text = strndup(text, text_len);

397

398

result = my_regex_replace(c_pattern, c_replace, c_text);

399

400

free(c_pattern);

401

free(c_replace);

402

free(c_text);

403

404

if (result) {

405

RETURN_STRING(result);

406

} else {

407

RETURN_NULL;

408

}

409

</code>

410

<test>

411

<code>

412

SELECT REGEXP_REPLACE("lala foo lala", "foo", "bar") AS r1;

413

</code>

414

415

416

lala bar lala

417

</result>

418

</test>

419

</function>

420

421

422

<![CDATA[

423

424

#if 0

425

#if MYSQL_VERSION_ID < 50000

426

#error need MySQL >= 5.0

427

#endif

428

#endif

429

430

#include <sys/types.h>

431

432

// TODO: my_regex.h is not installed by "make install"

433

#include <regex/my_regex.h>

434

435

// helper function borrowed from PHP, slightly modified

436

static char *my_regex_replace(const char *pattern, const char *replace, const char *string)

437

{

438

my_regex_t re;

439

my_regmatch_t *subs;

440

441

char *buf, /* buf is where we build the replaced string */

442

*nbuf, /* nbuf is used when we grow the buffer */

443

*walkbuf; /* used to walk buf when replacing backrefs */

444

const char *walk; /* used to walk replacement string for backrefs */

445

int buf_len;

446

int pos, tmp, string_len, new_l;

447

int err, copts = REG_EXTENDED;

448

449

string_len = strlen(string);

450

451

err = my_regcomp(&re, pattern, copts, &my_charset_latin1);

452

if (err) {

453

return NULL;

454

}

455

456

/* allocate storage for (sub-)expression-matches */

457

subs = (my_regmatch_t *)calloc(sizeof(my_regmatch_t),re.re_nsub+1);

458

459

/* start with a buffer that is twice the size of the stringo

460

we're doing replacements in */

461

buf_len = 2 * string_len + 1;

462

buf = calloc(buf_len, sizeof(char));

463

464

err = pos = 0;

465

buf[0] = '\0';

466

while (!err) {

467

err = my_regexec(&re, &string[pos], re.re_nsub+1, subs, (pos ? REG_NOTBOL : 0));

468

469

if (err && err != REG_NOMATCH) {

470

free(subs);

471

free(buf);

472

my_regfree(&re);

473

return NULL;

474

}

475

476

if (!err) {

477

/* backref replacement is done in two passes:

478

1) find out how long the string will be, and allocate buf

479

2) copy the part before match, replacement and backrefs to buf

480

481

Jaakko Hyv�tti <Jaakko.Hyvatti@iki.fi>

482

483

484

new_l = strlen(buf) + subs[0].rm_so; /* part before the match */

485

walk = replace;

486

while (*walk) {

487

if ('\\' == *walk && isdigit((unsigned char)walk[1]) && ((unsigned char)walk[1]) - '0' <= re.re_nsub) {

488

if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1) {

489

new_l += subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so;

490

}

491

walk += 2;

492

} else {

493

new_l++;

494

walk++;

495

}

496

}

497

if (new_l + 1 > buf_len) {

498

buf_len = 1 + buf_len + 2 * new_l;

499

nbuf = malloc(buf_len);

500

strcpy(nbuf, buf);

501

free(buf);

502

buf = nbuf;

503

}

504

tmp = strlen(buf);

505

/* copy the part of the string before the match */

506

strncat(buf, &string[pos], subs[0].rm_so);

507

508

/* copy replacement and backrefs */

509

walkbuf = &buf[tmp + subs[0].rm_so];

510

walk = replace;

511

while (*walk) {

512

if ('\\' == *walk && isdigit(walk[1]) && walk[1] - '0' <= (int)re.re_nsub) {

513

if (subs[walk[1] - '0'].rm_so > -1 && subs[walk[1] - '0'].rm_eo > -1

514

/* this next case shouldn't happen. it does. */

515

&& subs[walk[1] - '0'].rm_so <= subs[walk[1] - '0'].rm_eo) {

516

517

tmp = subs[walk[1] - '0'].rm_eo - subs[walk[1] - '0'].rm_so;

518

memcpy (walkbuf, &string[pos + subs[walk[1] - '0'].rm_so], tmp);

519

walkbuf += tmp;

520

}

521

walk += 2;

522

} else {

523

*walkbuf++ = *walk++;

524

}

525

}

526

*walkbuf = '\0';

527

528

/* and get ready to keep looking for replacements */

529

if (subs[0].rm_so == subs[0].rm_eo) {

530

if (subs[0].rm_so + pos >= string_len) {

531

break;

532

}

533

new_l = strlen (buf) + 1;

534

if (new_l + 1 > buf_len) {

535

buf_len = 1 + buf_len + 2 * new_l;

536

nbuf = calloc(buf_len, sizeof(char));

537

strcpy(nbuf, buf);

538

free(buf);

539

buf = nbuf;

540

}

541

pos += subs[0].rm_eo + 1;

542

buf [new_l-1] = string [pos-1];

543

buf [new_l] = '\0';

544

} else {

545

pos += subs[0].rm_eo;

546

}

547

} else { /* REG_NOMATCH */

548

new_l = strlen(buf) + strlen(&string[pos]);

549

if (new_l + 1 > buf_len) {

550

buf_len = new_l + 1; /* now we know exactly how long it is */

551

nbuf = calloc(buf_len, sizeof(char));

552

strcpy(nbuf, buf);

553

free(buf);

554

buf = nbuf;

555

}

556

/* stick that last bit of string on our output */

557

strcat(buf, &string[pos]);

558

}

559

}

560

561

/* don't want to leak memory .. */

562

free(subs);

563

my_regfree(&re);

564

565

/* whew. */

566

return (buf);

567

}

568

569

/* Translate a mode string into regcomp() flags.

   Starts from REG_EXTENDED | REG_NEWLINE and applies the mode characters
   from left to right, so a later 'c' overrides an earlier 'i' and vice
   versa. `mode` may be NULL or empty, in which case the defaults are
   returned. Unknown characters are ignored.

   Returns the resulting flag set. */
static int parse_mode(const char *mode)
{
    int flags = REG_EXTENDED | REG_NEWLINE;

    if (mode) {
        /* test the terminator before consuming a character so that an
           empty mode string is never read past its end */
        for (; *mode != '\0'; mode++) {
            switch (*mode) {
            case 'i': flags |= REG_ICASE;  break; /* case insensitive */
            case 'c': flags &= ~REG_ICASE; break; /* case sensitive */
            case 'n': break; /* . matches newline (accepted, no effect) */
            case 'm': break; /* multiple lines    (accepted, no effect) */
            case 'x': break; /* ignore whitespace (accepted, no effect) */
            default:  break;
            }
        }
    }

    /* NOTE(review): debug trace emitted on every call; consider removing */
    fprintf(stderr, "flags are %X\n", (unsigned)flags);

    return flags;
}

590

591

592

]]>

593

</code>

594

595

</extension>

Older »