~ubuntu-branches/ubuntu/hardy/exim4/hardy-proposed

Viewing changes to src/pcre/pcre_compile.c

Committer: Bazaar Package Importer
Author(s): Marc Haber
Date: 2005-07-02 06:08:34 UTC
mfrom: (1.1.1 upstream)
Revision ID: james.westby@ubuntu.com-20050702060834-qk17pd52kb9nt3bj

Tags: 4.52-1

http://bugs.debian.org/315775

* new upstream version 4.51. (mh)
  * adapt 70_remove_exim-users_references
  * remove 37_gnutlsparams
  * adapt 36_pcre
  * adapt 31_eximmanpage
* fix package priorities to have them in sync with override again. (mh)
* Fix error in nb (Norwegian) translation.
  Thanks to Helge Hafting. (mh). Closes: #315775
* Standards-Version: 3.6.2, no changes needed. (mh)

files added:
OS/Makefile-GNUkFreeBSD

OS/Makefile-GNUkNetBSD

OS/os.h-GNUkFreeBSD

OS/os.h-GNUkNetBSD

debian/config-custom/debian/install

debian/linda

debian/linda-overrides

debian/linda/overrides

debian/linda/overrides/exim4-daemon-heavy

debian/linda/overrides/exim4-daemon-light

debian/lintian

debian/lintian/overrides

debian/lintian/overrides/exim4-config

debian/lintian/overrides/exim4-daemon-heavy

debian/lintian/overrides/exim4-daemon-light

debian/patches/60_convert4r4.dpatch

debian/patches/70_remove_exim-users_references.dpatch

debian/po/et.po

debian/po/gl.po

debian/po/tl.po

debian/po/vi.po

doc/experimental-spec.txt

src/auths/cyrus_sasl.c

src/auths/cyrus_sasl.h

src/bmi_spam.c

src/bmi_spam.h

src/demime.c

src/demime.h

src/dk.c

src/dk.h

src/lookups/lf_quote.c

src/lookups/spf.c

src/lookups/spf.h

src/malware.c

src/mime.c

src/mime.h

src/pcre/pcre_compile.c

src/pcre/pcre_config.c

src/pcre/pcre_exec.c

src/pcre/pcre_fullinfo.c

src/pcre/pcre_get.c

src/pcre/pcre_globals.c

src/pcre/pcre_internal.h

src/pcre/pcre_maketables.c

src/pcre/pcre_printint.c

src/pcre/pcre_study.c

src/pcre/pcre_tables.c

src/pcre/pcre_try_flipped.c

src/pcre/pcre_version.c

src/pcre/ucp.h

src/regex.c

src/spam.c

src/spam.h

src/spf.c

src/spf.h

src/spool_mbox.c

src/srs.c

src/srs.h

util/README

util/mkcdb.pl

files removed:
OS/Makefile-Linux-libc5

OS/os.c-Linux-libc5

OS/os.h-Linux-libc5

debian/README.TLS

debian/config-custom/debian/files

debian/debconf/30_exim4-config_example_check_rcpt

debian/exim4-config-medium

debian/exim4-config-medium/debian

debian/exim4-config-medium/debian/changelog

debian/exim4-config-medium/debian/compat

debian/exim4-config-medium/debian/config

debian/exim4-config-medium/debian/config/30_exim4-config_example_check_rcpt

debian/exim4-config-medium/debian/config/conf.d

debian/exim4-config-medium/debian/config/conf.d/30_exim4-config-medium_example_check_rcpt

debian/exim4-config-medium/debian/config/conf.d/conf.d

debian/exim4-config-medium/debian/config/conf.d/conf.d/acl

debian/exim4-config-medium/debian/config/conf.d/conf.d/acl/00_exim4-config-medium_header

debian/exim4-config-medium/debian/config/conf.d/conf.d/acl/20_exim4-config-medium_whitelist_local_deny

debian/exim4-config-medium/debian/config/conf.d/conf.d/acl/30_exim4-config-medium_check_rcpt

debian/exim4-config-medium/debian/config/conf.d/conf.d/acl/40_exim4-config-medium_check_data

debian/exim4-config-medium/debian/config/conf.d/conf.d/auth

debian/exim4-config-medium/debian/config/conf.d/conf.d/auth/00_exim4-config-medium_header

debian/exim4-config-medium/debian/config/conf.d/conf.d/auth/30_exim4-config-medium_examples

debian/exim4-config-medium/debian/config/conf.d/conf.d/main

debian/exim4-config-medium/debian/config/conf.d/conf.d/main/01_exim4-config-medium_listmacrosdefs

debian/exim4-config-medium/debian/config/conf.d/conf.d/main/02_exim4-config-medium_options

debian/exim4-config-medium/debian/config/conf.d/conf.d/main/03_exim4-config-medium_tlsoptions

debian/exim4-config-medium/debian/config/conf.d/conf.d/retry

debian/exim4-config-medium/debian/config/conf.d/conf.d/retry/00_exim4-config-medium_header

debian/exim4-config-medium/debian/config/conf.d/conf.d/retry/30_exim4-config-medium

debian/exim4-config-medium/debian/config/conf.d/conf.d/rewrite

debian/exim4-config-medium/debian/config/conf.d/conf.d/rewrite/00_exim4-config-medium_header

debian/exim4-config-medium/debian/config/conf.d/conf.d/rewrite/31_exim4-config-medium_rewriting

debian/exim4-config-medium/debian/config/conf.d/conf.d/router

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/00_exim4-config-medium_header

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/100_exim4-config-medium_domain_literal

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/200_exim4-config-medium_primary

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/300_exim4-config-medium_real_local

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/400_exim4-config-medium_system_aliases

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/500_exim4-config-medium_hubuser

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/600_exim4-config-medium_userforward

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/700_exim4-config-medium_procmail

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/800_exim4-config-medium_maildrop

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/900_exim4-config-medium_local_user

debian/exim4-config-medium/debian/config/conf.d/conf.d/router/mmm_mail4root

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/00_exim4-config-medium_header

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/30_exim4-config-medium_address_file

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/30_exim4-config-medium_address_pipe

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/30_exim4-config-medium_address_reply

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/30_exim4-config-medium_mail_spool

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/30_exim4-config-medium_maildir_home

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/30_exim4-config-medium_maildrop_pipe

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/30_exim4-config-medium_procmail_pipe

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/30_exim4-config-medium_remote_smtp

debian/exim4-config-medium/debian/config/conf.d/conf.d/transport/35_exim4-config-medium_address_directory

debian/exim4-config-medium/debian/config/conf.d/default_acl

debian/exim4-config-medium/debian/config/default_acl

debian/exim4-config-medium/debian/config/update-exim4.conf

debian/exim4-config-medium/debian/control

debian/exim4-config-medium/debian/copyright

debian/exim4-config-medium/debian/email-addresses

debian/exim4-config-medium/debian/exim4-config-medium.dirs

debian/exim4-config-medium/debian/exim4-config-medium.manpages

debian/exim4-config-medium/debian/exim4-config-medium.postinst

debian/exim4-config-medium/debian/exim4-config-medium.postrm

debian/exim4-config-medium/debian/ip-up.d

debian/exim4-config-medium/debian/manpages

debian/exim4-config-medium/debian/manpages/update-exim4.conf.8

debian/exim4-config-medium/debian/manpages/update-exim4defaults.8

debian/exim4-config-medium/debian/rules

debian/exim4-config-medium/debian/update-exim4.conf.conf

debian/exim4-config-medium/debian/update-exim4defaults

debian/exim4-config-simple

debian/exim4-config-simple/debian

debian/exim4-config-simple/debian/changelog

debian/exim4-config-simple/debian/compat

debian/exim4-config-simple/debian/control

debian/exim4-config-simple/debian/copyright

debian/exim4-config-simple/debian/debconf

debian/exim4-config-simple/debian/debconf/update-exim4.conf

debian/exim4-config-simple/debian/exim4-config-simple.dirs

debian/exim4-config-simple/debian/exim4-config-simple.manpages

debian/exim4-config-simple/debian/exim4-config-simple.postinst

debian/exim4-config-simple/debian/exim4-config-simple.postrm

debian/exim4-config-simple/debian/exim4.conf.defaults

debian/exim4-config-simple/debian/exim4.conf.source

debian/exim4-config-simple/debian/manpages

debian/exim4-config-simple/debian/manpages/update-exim4.conf.8

debian/exim4-config-simple/debian/rules

debian/patches/10_daemon_close_fds.dpatch

debian/patches/60_upstream_fixes.dpatch

debian/patches/61_queryprogramrouter.dpatch

debian/patches/62_statvfs.dpatch

debian/patches/63_nomorecrashongnutlserror.dpatch

debian/patches/64_pipeliningfixup.dpatch

debian/patches/65_tidydb-splitspool.dpatch

debian/patches/66_can2005-0021_can2005-0022.dpatch

debian/patches/exiscan.patch

files modified:
ACKNOWLEDGMENTS

CHANGES

LICENCE

Makefile

NOTICE

OS/Makefile-AIX

OS/Makefile-BSDI

OS/Makefile-Base

OS/Makefile-CYGWIN

OS/Makefile-DGUX

OS/Makefile-Darwin

OS/Makefile-Default

OS/Makefile-FreeBSD

OS/Makefile-GNU

OS/Makefile-HI-OSF

OS/Makefile-HI-UX

OS/Makefile-HP-UX

OS/Makefile-HP-UX-9

OS/Makefile-IRIX

OS/Makefile-IRIX6

OS/Makefile-IRIX632

OS/Makefile-IRIX65

OS/Makefile-Linux

OS/Makefile-NetBSD

OS/Makefile-NetBSD-a.out

OS/Makefile-OSF1

OS/Makefile-OpenBSD

OS/Makefile-OpenUNIX

OS/Makefile-QNX

OS/Makefile-SCO

OS/Makefile-SCO_SV

OS/Makefile-SunOS4

OS/Makefile-SunOS5

OS/Makefile-SunOS5-hal

OS/Makefile-ULTRIX

OS/Makefile-UNIX_SV

OS/Makefile-USG

OS/Makefile-Unixware7

OS/Makefile-mips

OS/eximon.conf-Default

OS/os.Configuring

OS/os.c-GNU

OS/os.c-HI-OSF

OS/os.c-IRIX

OS/os.c-IRIX6

OS/os.c-IRIX632

OS/os.c-IRIX65

OS/os.c-Linux

OS/os.c-OSF1

OS/os.c-cygwin

OS/os.h-AIX

OS/os.h-BSDI

OS/os.h-DGUX

OS/os.h-Darwin

OS/os.h-FreeBSD

OS/os.h-GNU

OS/os.h-HI-OSF

OS/os.h-HI-UX

OS/os.h-HP-UX

OS/os.h-HP-UX-9

OS/os.h-IRIX

OS/os.h-IRIX6

OS/os.h-IRIX632

OS/os.h-IRIX65

OS/os.h-Linux

OS/os.h-NetBSD

OS/os.h-NetBSD-a.out

OS/os.h-OSF1

OS/os.h-OpenBSD

OS/os.h-OpenUNIX

OS/os.h-QNX

OS/os.h-SCO

OS/os.h-SCO_SV

OS/os.h-SunOS4

OS/os.h-SunOS5

OS/os.h-SunOS5-hal

OS/os.h-ULTRIX

OS/os.h-UNIX_SV

OS/os.h-USG

OS/os.h-Unixware7

OS/os.h-cygwin

OS/os.h-mips

README

README.UPDATING

debian/EDITME.exim4-heavy.diff

debian/EDITME.exim4-light.diff

debian/EDITME.eximon.diff

debian/README.Debian

debian/README.Debian-accountname

debian/README.Debian.UUCP

debian/README.Debian.xinetd

debian/README.SMTP-AUTH

debian/README.system_aliases

debian/TODO

debian/changelog

debian/config-custom/create-custom-config-package

debian/config-custom/debian/rules

debian/control

debian/create-custom-package

debian/debconf/conf.d/acl/20_exim4-config_whitelist_local_deny

debian/debconf/conf.d/acl/30_exim4-config_check_rcpt

debian/debconf/conf.d/acl/40_exim4-config_check_data

debian/debconf/conf.d/auth/30_exim4-config_examples

debian/debconf/conf.d/main/01_exim4-config_listmacrosdefs

debian/debconf/conf.d/main/02_exim4-config_options

debian/debconf/conf.d/main/03_exim4-config_tlsoptions

debian/debconf/conf.d/rewrite/31_exim4-config_rewriting

debian/debconf/conf.d/router/100_exim4-config_domain_literal

debian/debconf/conf.d/router/300_exim4-config_real_local

debian/debconf/conf.d/router/400_exim4-config_system_aliases

debian/debconf/conf.d/router/500_exim4-config_hubuser

debian/debconf/conf.d/transport/30_exim4-config_maildir_home

debian/debconf/default_acl

debian/debconf/update-exim4.conf

debian/debconf/update-exim4.conf.template

debian/exim-gencert

debian/exim4-base.config

debian/exim4-base.cron.daily

debian/exim4-base.docs

debian/exim4-base.init

debian/exim4-base.postinst

debian/exim4-base.postrm

debian/exim4-base.templates

debian/exim4-config.NEWS

debian/exim4-config.config

debian/exim4-config.docs

debian/exim4-config.install

debian/exim4-config.postinst

debian/exim4-config.postrm

debian/exim4-config.templates

debian/exim4-config.templates.master

debian/exim4-daemon-custom.links

debian/exim4-daemon-heavy.install

debian/exim4-daemon-heavy.links

debian/exim4-daemon-light.install

debian/exim4-daemon-light.links

debian/exim4-daemon-light.postinst

debian/exim4-daemon-light.prerm

debian/ip-up.d

debian/manpages/exiwhat.8

debian/manpages/update-exim4.conf.8

debian/manpages/update-exim4.conf.template.8

debian/patches/00list

debian/patches/31_eximmanpage.dpatch

debian/patches/36_pcre.dpatch

debian/po/ar.po

debian/po/bg.po

debian/po/bs.po

debian/po/ca.po

debian/po/cs.po

debian/po/cy.po

debian/po/da.po

debian/po/de.po

debian/po/el.po

debian/po/es.po

debian/po/eu.po

debian/po/fi.po

debian/po/fr.po

debian/po/he.po

debian/po/hr.po

debian/po/hu.po

debian/po/id.po

debian/po/it.po

debian/po/ja.po

debian/po/ko.po

debian/po/lt.po

debian/po/mk.po

debian/po/nb.po

debian/po/nl.po

debian/po/nn.po

debian/po/pl.po

debian/po/pt.po

debian/po/pt_BR.po

debian/po/ro.po

debian/po/ru.po

debian/po/sk.po

debian/po/sl.po

debian/po/sq.po

debian/po/sv.po

debian/po/templates.pot

debian/po/tr.po

debian/po/uk.po

debian/po/zh_CN.po

debian/po/zh_TW.po

debian/rules

debian/script

debian/update-exim4defaults

doc/ChangeLog

doc/Exim3.upgrade

doc/Exim4.upgrade

doc/NewStuff

doc/OptionLists.txt

doc/README

doc/README.SIEVE

doc/dbm.discuss.txt

doc/exim.8

doc/filter.txt

doc/pcrepattern.txt

doc/pcretest.txt

doc/spec.txt

exim_monitor/EDITME

exim_monitor/em_StripChart.c

exim_monitor/em_TextPop.c

exim_monitor/em_globals.c

exim_monitor/em_hdr.h

exim_monitor/em_init.c

exim_monitor/em_log.c

exim_monitor/em_main.c

exim_monitor/em_menu.c

exim_monitor/em_queue.c

exim_monitor/em_strip.c

exim_monitor/em_text.c

exim_monitor/em_version.c

exim_monitor/em_xs.c

scripts/Configure

scripts/Configure-Makefile

scripts/Configure-config.h

scripts/Configure-eximon

scripts/Configure-os.c

scripts/Configure-os.h

scripts/MakeLinks

scripts/arch-type

scripts/exim_install

scripts/newer

scripts/os-type

src/EDITME

src/acl.c

src/aliases.default

src/auths/Makefile

src/auths/README

src/auths/auth-spa.c

src/auths/auth-spa.h

src/auths/b64decode.c

src/auths/b64encode.c

src/auths/call_pam.c

src/auths/call_pwcheck.c

src/auths/call_radius.c

src/auths/cram_md5.c

src/auths/cram_md5.h

src/auths/get_data.c

src/auths/get_no64_data.c

src/auths/md5.c

src/auths/plaintext.c

src/auths/plaintext.h

src/auths/pwcheck.c

src/auths/pwcheck.h

src/auths/sha1.c

src/auths/spa.c

src/auths/spa.h

src/auths/xtextdecode.c

src/auths/xtextencode.c

src/buildconfig.c

src/child.c

src/config.h.defaults

src/configure.default

src/convert4r3.src

src/convert4r4.src

src/crypt16.c

src/daemon.c

src/dbfn.c

src/dbfunctions.h

src/dbstuff.h

src/debug.c

src/deliver.c

src/directory.c

src/dns.c

src/drtables.c

src/dummies.c

src/enq.c

src/exicyclog.src

src/exigrep.src

src/exim.c

src/exim.h

src/exim_checkaccess.src

src/exim_dbmbuild.c

src/exim_dbutil.c

src/exim_lock.c

src/eximon.src

src/eximstats.src

src/exinext.src

src/exipick.src

src/exiqgrep.src

src/exiqsumm.src

src/exiwhat.src

src/expand.c

src/filter.c

src/filtertest.c

src/functions.h

src/globals.c

src/globals.h

src/header.c

src/host.c

src/ip.c

src/local_scan.c

src/local_scan.h

src/log.c

src/lookups/Makefile

src/lookups/README

src/lookups/cdb.c

src/lookups/cdb.h

src/lookups/dbmdb.c

src/lookups/dbmdb.h

src/lookups/dnsdb.c

src/lookups/dnsdb.h

src/lookups/dsearch.c

src/lookups/dsearch.h

src/lookups/ibase.c

src/lookups/ibase.h

src/lookups/ldap.c

src/lookups/ldap.h

src/lookups/lf_check_file.c

src/lookups/lf_functions.h

src/lookups/lsearch.c

src/lookups/lsearch.h

src/lookups/mysql.c

src/lookups/mysql.h

src/lookups/nis.c

src/lookups/nis.h

src/lookups/nisplus.c

src/lookups/nisplus.h

src/lookups/oracle.c

src/lookups/oracle.h

src/lookups/passwd.c

src/lookups/passwd.h

src/lookups/pgsql.c

src/lookups/pgsql.h

src/lookups/testdb.c

src/lookups/testdb.h

src/lookups/whoson.c

src/lookups/whoson.h

src/lss.c

src/macros.h

src/match.c

src/moan.c

src/mytypes.h

src/os.c

src/osfunctions.h

src/parse.c

src/pcre/ChangeLog

src/pcre/LICENCE

src/pcre/Makefile

src/pcre/README

src/pcre/config.h

src/pcre/dftables.c

src/pcre/get.c

src/pcre/internal.h

src/pcre/maketables.c

src/pcre/pcre.c

src/pcre/pcre.h

src/pcre/pcretest.c

src/pcre/printint.c

src/pcre/study.c

src/perl.c

src/queue.c

src/rda.c

src/readconf.c

src/receive.c

src/retry.c

src/rewrite.c

src/rfc2047.c

src/route.c

src/routers/Makefile

src/routers/README

src/routers/accept.c

src/routers/accept.h

src/routers/dnslookup.c

src/routers/dnslookup.h

src/routers/ipliteral.c

src/routers/ipliteral.h

src/routers/iplookup.c

src/routers/iplookup.h

src/routers/manualroute.c

src/routers/manualroute.h

src/routers/queryprogram.c

src/routers/queryprogram.h

src/routers/redirect.c

src/routers/redirect.h

src/routers/rf_change_domain.c

src/routers/rf_expand_data.c

src/routers/rf_functions.h

src/routers/rf_get_errors_address.c

src/routers/rf_get_munge_headers.c

src/routers/rf_get_transport.c

src/routers/rf_get_ugid.c

src/routers/rf_lookup_hostlist.c

src/routers/rf_queue_add.c

src/routers/rf_self_action.c

src/routers/rf_set_ugid.c

src/search.c

src/sieve.c

src/smtp_in.c

src/smtp_out.c

src/spool_in.c

src/spool_out.c

src/store.c

src/store.h

src/string.c

src/structs.h

src/tls-gnu.c

src/tls-openssl.c

src/tls.c

src/tod.c

src/transport-filter.src

src/transport.c

src/transports/Makefile

src/transports/README

src/transports/appendfile.c

src/transports/appendfile.h

src/transports/autoreply.c

src/transports/autoreply.h

src/transports/lmtp.c

src/transports/lmtp.h

src/transports/pipe.c

src/transports/pipe.h

src/transports/smtp.c

src/transports/smtp.h

src/transports/tf_maildir.c

src/transports/tf_maildir.h

src/tree.c

src/verify.c

src/version.c

util/cramtest.pl

util/logargs.sh

util/unknownuser.sh

Show diffs side-by-side

added added

removed removed

src/pcre/pcre_compile.c

/* $Cambridge: exim/exim-src/src/pcre/pcre_compile.c,v 1.1 2005/06/15 08:57:10 ph10 Exp $ */

/*************************************************

* Perl-Compatible Regular Expressions *

*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax

and semantics are as close as possible to those of the Perl 5 language.

Written by Philip Hazel

-----------------------------------------------------------------------------

Redistribution and use in source and binary forms, with or without

modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,

this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright

notice, this list of conditions and the following disclaimer in the

documentation and/or other materials provided with the distribution.

* Neither the name of the University of Cambridge nor the names of its

contributors may be used to endorse or promote products derived from

this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

POSSIBILITY OF SUCH DAMAGE.

-----------------------------------------------------------------------------
*/

/* This module contains the external function pcre_compile(), along with

supporting internal functions that are not used by other modules. */

#include "pcre_internal.h"

/*************************************************

* Code parameters and static tables *

*************************************************/

/* Maximum number of items on the nested bracket stacks at compile time. This

applies to the nesting of all kinds of parentheses. It does not limit

un-nested, non-capturing parentheses. This number can be made bigger if

necessary - it is used to dimension one int and one unsigned char vector at

compile time. */

#define BRASTACK_SIZE 200

/* Table for handling escaped characters in the range '0'-'z'. Positive returns

are simple data values; negative values are for special things like \d and so

on. Zero means further processing is needed (for things like \x), or the escape

is invalid. */

#if !EBCDIC /* This is the "normal" table for ASCII systems */

/* ASCII escape lookup table: one entry per character from '0' through 'z', in
character-code order (the trailing comment on each row names the characters
that row covers). An entry is 0, a literal character value (for example 7 in
the slot for 'a'), an ESC_xxx constant, or a negated ESC_xxx constant.
NOTE(review): presumably indexed as escapes[c - '0'] by the escape-parsing
code, which is not visible in this part of the file - confirm at the caller. */
static const short int escapes[] = {

0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */

0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */

'@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */

0, 0, 0, 0, 0, 0, 0, 0, /* H - O */

-ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */

-ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */

'`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */

0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */

-ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */

0, 0, -ESC_z /* x - z */

};

#else /* This is the "abnormal" table for EBCDIC systems */

static const short int escapes[] = {

/* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',

/* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,

/* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',

/* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,

/* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',

/* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,

/* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',

/* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,

/* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,

/* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,

/* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,

/* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,

/* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,

/* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,

/* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',

/* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,

100

/* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,

101

/* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,

102

/* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,

103

/* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,

104

/* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,

105

/* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,

106

/* F8 */ 0, 0, 0, 0, 0, 0, 0, 0

107

};

108

#endif

/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper,
as this is assumed for handling case independence. */

static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word", "xdigit" };

120

static const uschar posix_name_lengths[] = {

121

5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };

122

123

/* Table of class bit maps for each POSIX class; up to three may be combined

124

to form the class. The table for [:blank:] is dynamically modified to remove

125

the vertical space characters. */

126

127

static const int posix_class_maps[] = {

128

cbit_lower, cbit_upper, -1, /* alpha */

129

cbit_lower, -1, -1, /* lower */

130

cbit_upper, -1, -1, /* upper */

131

cbit_digit, cbit_lower, cbit_upper, /* alnum */

132

cbit_print, cbit_cntrl, -1, /* ascii */

133

cbit_space, -1, -1, /* blank - a GNU extension */

134

cbit_cntrl, -1, -1, /* cntrl */

135

cbit_digit, -1, -1, /* digit */

136

cbit_graph, -1, -1, /* graph */

137

cbit_print, -1, -1, /* print */

138

cbit_punct, -1, -1, /* punct */

139

cbit_space, -1, -1, /* space */

140

cbit_word, -1, -1, /* word - a Perl extension */

141

cbit_xdigit,-1, -1 /* xdigit */

142

};

/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. A message is selected by its index in this
table; the numeric comments mark every fifth index. */

static const char *error_texts[] = {
  "no error",
  "\\ at end of pattern",
  "\\c at end of pattern",
  "unrecognized character follows \\",
  "numbers out of order in {} quantifier",
  /* 5 */
  "number too big in {} quantifier",
  "missing terminating ] for character class",
  "invalid escape sequence in character class",
  "range out of order in character class",
  "nothing to repeat",
  /* 10 */
  "operand of unlimited repeat could match the empty string",
  "internal error: unexpected repeat",
  "unrecognized character after (?",
  "POSIX named classes are supported only within a class",
  "missing )",
  /* 15 */
  "reference to non-existent subpattern",
  "erroffset passed as NULL",
  "unknown option bit(s) set",
  "missing ) after comment",
  "parentheses nested too deeply",
  /* 20 */
  "regular expression too large",
  "failed to get memory",
  "unmatched parentheses",
  "internal error: code overflow",
  "unrecognized character after (?<",
  /* 25 */
  "lookbehind assertion is not fixed length",
  "malformed number after (?(",
  "conditional group contains more than two branches",
  "assertion expected after (?(",
  "(?R or (?digits must be followed by )",
  /* 30 */
  "unknown POSIX class name",
  "POSIX collating elements are not supported",
  "this version of PCRE is not compiled with PCRE_UTF8 support",
  "spare error",
  "character value in \\x{...} sequence is too large",
  /* 35 */
  "invalid condition (?(0)",
  "\\C not allowed in lookbehind assertion",
  "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  "number after (?C is > 255",
  "closing ) for (?C expected",
  /* 40 */
  "recursive call could loop indefinitely",
  "unrecognized character after (?P",
  "syntax error after (?P",
  "two named groups have the same name",
  "invalid UTF-8 string",
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled",
  "malformed \\P or \\p sequence",
  "unknown property name after \\P or \\p"
};

/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */


#if !EBCDIC /* This is the "normal" case, for ASCII systems */

/* ASCII digit table: one byte per character code 0-255. Bit 0x04 marks a
decimal digit and bit 0x08 a hexadecimal digit, so '0'-'9' carry 0x0c and
'A'-'F' / 'a'-'f' carry 0x08; every other code is 0x00. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */


#else /* This is the "abnormal" case, for EBCDIC systems */

/* EBCDIC digit table: one byte per character code 0-255. Bit 0x04 marks a
decimal digit and bit 0x08 a hexadecimal digit. The digits sit at 0xF0-0xF9
(value 0x0c); the letters a-f and A-F sit at 0x81-0x86 and 0xC1-0xC6 (value
0x08). Where a row comment ends in a hex number, that is the code of the
row's first entry. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */


297

/* Character-type table for EBCDIC, 256 entries indexed by character code
(described in the original source as a partial duplicate of the main character
table). check_escape() masks an entry with 0x0E to decide whether a character
is alphanumeric. Each row comment gives the decimal index range covered. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  64- 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- 79 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  80- 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  96-103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104-111 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120-127 */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128-135 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144-151 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160-167 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 192-199 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 208-215 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 224-231 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 240-247 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00  /* 248-255 */
  };

330

#endif

331

332

333

/* Definition to allow mutual recursion */

334

335

static BOOL

336

compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,

337

int *, int *, branch_chain *, compile_data *);

338

339

340

341

/*************************************************

342

* Handle escapes *

343

*************************************************/

344

345

/* This function is called when a \ has been encountered. It either returns a

346

positive value for a simple escape such as \n, or a negative value which

347

encodes one of the more complicated things such as \d. When UTF-8 is enabled,

348

a positive value greater than 255 may be returned. On entry, ptr is pointing at

349

the \. On exit, it is on the final character of the escape sequence.

350

351

Arguments:

352

ptrptr points to the pattern position pointer

353

errorcodeptr points to the errorcode variable

354

bracount number of previous extracting brackets

355

options the options bits

356

isclass TRUE if inside a character class

357

358

Returns: zero or positive => a data character

359

negative => a special escape sequence

360

on error, errorptr is set

361

362

363

static int

364

check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,

365

int options, BOOL isclass)

366

{

367

const uschar *ptr = *ptrptr;

368

int c, i;

369

370

/* If backslash is at the end of the pattern, it's an error. */

371

372

c = *(++ptr);

373

if (c == 0) *errorcodeptr = ERR1;

374

375

/* Non-alphamerics are literals. For digits or letters, do an initial lookup in

376

a table. A non-zero result is something that can be returned immediately.

377

Otherwise further processing may be required. */

378

379

#if !EBCDIC /* ASCII coding */

380

else if (c < '0' || c > 'z') {} /* Not alphameric */

381

else if ((i = escapes[c - '0']) != 0) c = i;

382

383

#else /* EBCDIC coding */

384

else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */

385

else if ((i = escapes[c - 0x48]) != 0) c = i;

386

#endif

387

388

/* Escapes that need further processing, or are illegal. */

389

390

else

391

{

392

const uschar *oldptr;

393

switch (c)

394

{

395

/* A number of Perl escapes are not handled by PCRE. We give an explicit

396

error. */

397

398

case 'l':

399

case 'L':

400

case 'N':

401

case 'u':

402

case 'U':

403

*errorcodeptr = ERR37;

404

break;

405

406

/* The handling of escape sequences consisting of a string of digits

407

starting with one that is not zero is not straightforward. By experiment,

408

the way Perl works seems to be as follows:

409

410

Outside a character class, the digits are read as a decimal number. If the

411

number is less than 10, or if there are that many previous extracting

412

left brackets, then it is a back reference. Otherwise, up to three octal

413

digits are read to form an escaped byte. Thus \123 is likely to be octal

414

123 (cf \0123, which is octal 012 followed by the literal 3). If the octal

415

value is greater than 377, the least significant 8 bits are taken. Inside a

416

character class, \ followed by a digit is always an octal number. */

417

418

case '1': case '2': case '3': case '4': case '5':

419

case '6': case '7': case '8': case '9':

420

421

if (!isclass)

422

{

423

oldptr = ptr;

424

c -= '0';

425

while ((digitab[ptr[1]] & ctype_digit) != 0)

426

c = c * 10 + *(++ptr) - '0';

427

if (c < 10 || c <= bracount)

428

{

429

c = -(ESC_REF + c);

430

break;

431

}

432

ptr = oldptr; /* Put the pointer back and fall through */

433

}

434

435

/* Handle an octal number following \. If the first digit is 8 or 9, Perl

436

generates a binary zero byte and treats the digit as a following literal.

437

Thus we have to pull back the pointer by one. */

438

439

if ((c = *ptr) >= '8')

440

{

441

ptr--;

442

c = 0;

443

break;

444

}

445

446

/* \0 always starts an octal number, but we may drop through to here with a

447

larger first octal digit. */

448

449

case '0':

450

c -= '0';

451

while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')

452

c = c * 8 + *(++ptr) - '0';

453

c &= 255; /* Take least significant 8 bits */

454

break;

455

456

/* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number

457

which can be greater than 0xff, but only if the ddd are hex digits. */

458

459

case 'x':

460

#ifdef SUPPORT_UTF8

461

if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)

462

{

463

const uschar *pt = ptr + 2;

464

465

c = 0;

466

while ((digitab[*pt] & ctype_xdigit) != 0)

467

{

468

int cc = *pt++;

469

count++;

470

#if !EBCDIC /* ASCII coding */

471

if (cc >= 'a') cc -= 32; /* Convert to upper case */

472

c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));

473

#else /* EBCDIC coding */

474

if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */

475

c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));

476

#endif

477

}

478

if (*pt == '}')

479

{

480

if (c < 0 || count > 8) *errorcodeptr = ERR34;

481

ptr = pt;

482

break;

483

}

484

/* If the sequence of hex digits does not end with '}', then we don't

485

recognize this construct; fall through to the normal \x handling. */

486

}

487

#endif

488

489

/* Read just a single hex char */

490

491

c = 0;

492

while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)

493

{

494

int cc; /* Some compilers don't like ++ */

495

cc = *(++ptr); /* in initializers */

496

#if !EBCDIC /* ASCII coding */

497

if (cc >= 'a') cc -= 32; /* Convert to upper case */

498

c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));

499

#else /* EBCDIC coding */

500

if (cc <= 'z') cc += 64; /* Convert to upper case */

501

c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));

502

#endif

503

}

504

break;

505

506

/* Other special escapes not starting with a digit are straightforward */

507

508

case 'c':

509

c = *(++ptr);

510

if (c == 0)

511

{

512

*errorcodeptr = ERR2;

513

return 0;

514

}

515

516

/* A letter is upper-cased; then the 0x40 bit is flipped. This coding

517

is ASCII-specific, but then the whole concept of \cx is ASCII-specific.

518

(However, an EBCDIC equivalent has now been added.) */

519

520

#if !EBCDIC /* ASCII coding */

521

if (c >= 'a' && c <= 'z') c -= 32;

522

c ^= 0x40;

523

#else /* EBCDIC coding */

524

if (c >= 'a' && c <= 'z') c += 64;

525

c ^= 0xC0;

526

#endif

527

break;

528

529

/* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any

530

other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,

531

for Perl compatibility, it is a literal. This code looks a bit odd, but

532

there used to be some cases other than the default, and there may be again

533

in future, so I haven't "optimized" it. */

534

535

default:

536

if ((options & PCRE_EXTRA) != 0) switch(c)

537

{

538

default:

539

*errorcodeptr = ERR3;

540

break;

541

}

542

break;

543

}

544

}

545

546

*ptrptr = ptr;

547

return c;

548

}

549

550

551

552

#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  errorcodeptr   points to the error code variable

Returns:         the value field of the matching _pcre_utt entry, or -1 for
                 an invalid or unknown name (with *errorcodeptr set to ERR46
                 for a malformed sequence, ERR47 for an unknown name)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
{
int c, i, bot, top;
const uschar *ptr = *ptrptr;
char name[4];        /* at most two name characters plus a terminating zero */

/* Give the output flag a defined value on every path, including the error
returns below. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a one- or two-character name in {}, optionally
preceded by ^ for negation. */

if (c == '{')
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Read up to three characters: at most two name characters followed by the
  closing brace. The writes to name[] therefore stay within indexes 0..2. */

  for (i = 0; i <= 2; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == '}') break;
    name[i] = c;
    }
  if (c != '}')   /* Try to distinguish error cases */
    {
    while (*(++ptr) != 0 && *ptr != '}');
    if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
    }
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = _pcre_utt_size;

while (bot < top)
  {
  i = (bot + top)/2;
  c = strcmp(name, _pcre_utt[i].name);
  if (c == 0) return _pcre_utt[i].value;
  if (c > 0) bot = i + 1; else top = i;
  }

UNKNOWN_RETURN:
*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif

641

642

643

644

645

/*************************************************

646

* Check for counted repeat *

647

*************************************************/

648

649

/* This function is called when a '{' is encountered in a place where it might

650

start a quantifier. It looks ahead to see if it really is a quantifier or not.

651

It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}

652

where the ddds are digits.

653

654

Arguments:

655

p pointer to the first char after '{'

656

657

Returns: TRUE or FALSE

658

659

660

static BOOL

661

is_counted_repeat(const uschar *p)

662

{

663

if ((digitab[*p++] & ctype_digit) == 0) return FALSE;

664

while ((digitab[*p] & ctype_digit) != 0) p++;

665

if (*p == '}') return TRUE;

666

667

if (*p++ != ',') return FALSE;

668

if (*p == '}') return TRUE;

669

670

if ((digitab[*p++] & ctype_digit) == 0) return FALSE;

671

while ((digitab[*p] & ctype_digit) != 0) p++;

672

673

return (*p == '}');

674

}

675

676

677

678

/*************************************************

679

* Read repeat counts *

680

*************************************************/

681

682

/* Read an item of the form {n,m} and return the values. This is called only

683

after is_counted_repeat() has confirmed that a repeat-count quantifier exists,

684

so the syntax is guaranteed to be correct, but we need to check the values.

685

686

Arguments:

687

p pointer to first char after '{'

688

minp pointer to int for min

689

maxp pointer to int for max

690

returned as -1 if no max

691

errorcodeptr points to error code variable

692

693

Returns: pointer to '}' on success;

694

current ptr on error, with errorcodeptr set non-zero

695

696

697

static const uschar *

698

read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)

699

{

700

int min = 0;

701

int max = -1;

702

703

while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';

704

705

if (*p == '}') max = min; else

706

{

707

if (*(++p) != '}')

708

{

709

max = 0;

710

while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';

711

if (max < min)

712

{

713

*errorcodeptr = ERR4;

714

return p;

715

}

716

}

717

}

718

719

/* Do paranoid checks, then fill in the required variables, and pass back the

720

pointer to the terminating '}'. */

721

722

if (min > 65535 || max > 65535)

723

*errorcodeptr = ERR5;

724

else

725

{

726

*minp = min;

727

*maxp = max;

728

}

729

return p;

730

}

731

732

733

734

/*************************************************

735

* Find first significant op code *

736

*************************************************/

737

738

/* This is called by several functions that scan a compiled expression looking

739

for a fixed first character, or an anchoring op code etc. It skips over things

740

that do not influence this. For some calls, a change of option is important.

741

For some calls, it makes sense to skip negative forward and all backward

742

assertions, and also the \b assertion; for others it does not.

743

744

Arguments:

745

code pointer to the start of the group

746

options pointer to external options

747

optbit the option bit whose changing is significant, or

748

zero if none are

749

skipassert TRUE if certain assertions are to be skipped

750

751

Returns: pointer to the first significant opcode

752

753

754

static const uschar*

755

first_significant_code(const uschar *code, int *options, int optbit,

756

BOOL skipassert)

757

{

758

for (;;)

759

{

760

switch ((int)*code)

761

{

762

case OP_OPT:

763

if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))

764

*options = (int)code[1];

765

code += 2;

766

break;

767

768

case OP_ASSERT_NOT:

769

case OP_ASSERTBACK:

770

case OP_ASSERTBACK_NOT:

771

if (!skipassert) return code;

772

do code += GET(code, 1); while (*code == OP_ALT);

773

code += _pcre_OP_lengths[*code];

774

break;

775

776

case OP_WORD_BOUNDARY:

777

case OP_NOT_WORD_BOUNDARY:

778

if (!skipassert) return code;

779

/* Fall through */

780

781

case OP_CALLOUT:

782

case OP_CREF:

783

case OP_BRANUMBER:

784

code += _pcre_OP_lengths[*code];

785

break;

786

787

default:

788

return code;

789

}

790

}

791

/* Control never reaches here */

792

}

793

794

795

796

797

/*************************************************

798

* Find the fixed length of a pattern *

799

*************************************************/

800

801

/* Scan a pattern and compute the fixed length of subject that will match it,

802

if the length is fixed. This is needed for dealing with backward assertions.

803

In UTF8 mode, the result is in characters rather than bytes.

804

805

Arguments:

806

code points to the start of the pattern (the bracket)

807

options the compiling options

808

809

Returns: the fixed length, or -1 if there is no fixed length,

810

or -2 if \C was encountered

811

812

813

static int

814

find_fixedlength(uschar *code, int options)

815

{

816

int length = -1;

817

818

819

820

821

/* Scan along the opcodes for this branch. If we get to the end of the

822

branch, check the length against that of the other branches. */

823

824

for (;;)

825

{

826

int d;

827

828

if (op >= OP_BRA) op = OP_BRA;

829

830

switch (op)

831

{

832

case OP_BRA:

833

case OP_ONCE:

834

case OP_COND:

835

d = find_fixedlength(cc, options);

836

if (d < 0) return d;

837

branchlength += d;

838

do cc += GET(cc, 1); while (*cc == OP_ALT);

839

cc += 1 + LINK_SIZE;

840

break;

841

842

/* Reached end of a branch; if it's a ket it is the end of a nested

843

call. If it's ALT it is an alternation in a nested call. If it is

844

END it's the end of the outer call. All can be handled by the same code. */

845

846

case OP_ALT:

847

case OP_KET:

848

case OP_KETRMAX:

849

case OP_KETRMIN:

850

case OP_END:

851

if (length < 0) length = branchlength;

852

else if (length != branchlength) return -1;

853

if (*cc != OP_ALT) return length;

854

cc += 1 + LINK_SIZE;

855

branchlength = 0;

856

break;

857

858

/* Skip over assertive subpatterns */

859

860

case OP_ASSERT:

861

case OP_ASSERT_NOT:

862

case OP_ASSERTBACK:

863

case OP_ASSERTBACK_NOT:

864

do cc += GET(cc, 1); while (*cc == OP_ALT);

865

/* Fall through */

866

867

/* Skip over things that don't match chars */

868

869

case OP_REVERSE:

870

case OP_BRANUMBER:

871

case OP_CREF:

872

case OP_OPT:

873

case OP_CALLOUT:

874

case OP_SOD:

875

case OP_SOM:

876

case OP_EOD:

877

case OP_EODN:

878

case OP_CIRC:

879

case OP_DOLL:

880

case OP_NOT_WORD_BOUNDARY:

881

case OP_WORD_BOUNDARY:

882

cc += _pcre_OP_lengths[*cc];

883

break;

884

885

/* Handle literal characters */

886

887

case OP_CHAR:

888

case OP_CHARNC:

889

branchlength++;

890

cc += 2;

891

#ifdef SUPPORT_UTF8

892

if ((options & PCRE_UTF8) != 0)

893

{

894

while ((*cc & 0xc0) == 0x80) cc++;

895

}

896

#endif

897

break;

898

899

/* Handle exact repetitions. The count is already in characters, but we

900

need to skip over a multibyte character in UTF8 mode. */

901

902

case OP_EXACT:

903

branchlength += GET2(cc,1);

904

cc += 4;

905

#ifdef SUPPORT_UTF8

906

if ((options & PCRE_UTF8) != 0)

907

{

908

while((*cc & 0x80) == 0x80) cc++;

909

}

910

#endif

911

break;

912

913

case OP_TYPEEXACT:

914

branchlength += GET2(cc,1);

915

cc += 4;

916

break;

917

918

/* Handle single-char matchers */

919

920

case OP_PROP:

921

case OP_NOTPROP:

922

cc++;

923

/* Fall through */

924

925

case OP_NOT_DIGIT:

926

case OP_DIGIT:

927

case OP_NOT_WHITESPACE:

928

case OP_WHITESPACE:

929

case OP_NOT_WORDCHAR:

930

case OP_WORDCHAR:

931

case OP_ANY:

932

branchlength++;

933

cc++;

934

break;

935

936

/* The single-byte matcher isn't allowed */

937

938

case OP_ANYBYTE:

939

return -2;

940

941

/* Check a class for variable quantification */

942

943

#ifdef SUPPORT_UTF8

944

case OP_XCLASS:

945

cc += GET(cc, 1) - 33;

946

/* Fall through */

947

#endif

948

949

case OP_CLASS:

950

case OP_NCLASS:

951

cc += 33;

952

953

switch (*cc)

954

{

955

case OP_CRSTAR:

956

case OP_CRMINSTAR:

957

case OP_CRQUERY:

958

case OP_CRMINQUERY:

959

return -1;

960

961

case OP_CRRANGE:

962

case OP_CRMINRANGE:

963

if (GET2(cc,1) != GET2(cc,3)) return -1;

964

branchlength += GET2(cc,1);

965

cc += 5;

966

break;

967

968

default:

969

branchlength++;

970

}

971

break;

972

973

/* Anything else is variable length */

974

975

default:

976

return -1;

977

}

978

}

979

/* Control never gets here */

980

}

981

982

983

984

985

/*************************************************

986

* Scan compiled regex for numbered bracket *

987

*************************************************/

988

989

/* This little function scans through a compiled pattern until it finds a

990

capturing bracket with the given number.

991

992

Arguments:

993

code points to start of expression

994

utf8 TRUE in UTF-8 mode

995

number the required bracket number

996

997

Returns: pointer to the opcode for the bracket, or NULL if not found

998

999

1000

static const uschar *

1001

find_bracket(const uschar *code, BOOL utf8, int number)

1002

{

1003

#ifndef SUPPORT_UTF8

1004

utf8 = utf8; /* Stop pedantic compilers complaining */

1005

#endif

1006

1007

for (;;)

1008

{

1009

1010

if (c == OP_END) return NULL;

1011

else if (c > OP_BRA)

1012

{

1013

int n = c - OP_BRA;

1014

if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);

1015

if (n == number) return (uschar *)code;

1016

code += _pcre_OP_lengths[OP_BRA];

1017

}

1018

else

1019

{

1020

code += _pcre_OP_lengths[c];

1021

1022

#ifdef SUPPORT_UTF8

1023

1024

/* In UTF-8 mode, opcodes that are followed by a character may be followed

1025

by a multi-byte character. The length in the table is a minimum, so we have

1026

to scan along to skip the extra bytes. All opcodes are less than 128, so we

1027

can use relatively efficient code. */

1028

1029

if (utf8) switch(c)

1030

{

1031

case OP_CHAR:

1032

case OP_CHARNC:

1033

case OP_EXACT:

1034

case OP_UPTO:

1035

case OP_MINUPTO:

1036

case OP_STAR:

1037

case OP_MINSTAR:

1038

case OP_PLUS:

1039

case OP_MINPLUS:

1040

case OP_QUERY:

1041

case OP_MINQUERY:

1042

while ((*code & 0xc0) == 0x80) code++;

1043

break;

1044

1045

/* XCLASS is used for classes that cannot be represented just by a bit

1046

map. This includes negated single high-valued characters. The length in

1047

the table is zero; the actual length is stored in the compiled code. */

1048

1049

case OP_XCLASS:

1050

code += GET(code, 1) + 1;

1051

break;

1052

}

1053

#endif

1054

}

1055

}

1056

}

1057

1058

1059

1060

/*************************************************

1061

* Scan compiled regex for recursion reference *

1062

*************************************************/

1063

1064

/* This little function scans through a compiled pattern until it finds an

1065

instance of OP_RECURSE.

1066

1067

Arguments:

1068

code points to start of expression

1069

utf8 TRUE in UTF-8 mode

1070

1071

Returns: pointer to the opcode for OP_RECURSE, or NULL if not found

1072

1073

1074

static const uschar *

1075

find_recurse(const uschar *code, BOOL utf8)

1076

{

1077

#ifndef SUPPORT_UTF8

1078

utf8 = utf8; /* Stop pedantic compilers complaining */

1079

#endif

1080

1081

for (;;)

1082

{

1083

1084

if (c == OP_END) return NULL;

1085

else if (c == OP_RECURSE) return code;

1086

else if (c > OP_BRA)

1087

{

1088

code += _pcre_OP_lengths[OP_BRA];

1089

}

1090

else

1091

{

1092

code += _pcre_OP_lengths[c];

1093

1094

#ifdef SUPPORT_UTF8

1095

1096

/* In UTF-8 mode, opcodes that are followed by a character may be followed

1097

by a multi-byte character. The length in the table is a minimum, so we have

1098

to scan along to skip the extra bytes. All opcodes are less than 128, so we

1099

can use relatively efficient code. */

1100

1101

if (utf8) switch(c)

1102

{

1103

case OP_CHAR:

1104

case OP_CHARNC:

1105

case OP_EXACT:

1106

case OP_UPTO:

1107

case OP_MINUPTO:

1108

case OP_STAR:

1109

case OP_MINSTAR:

1110

case OP_PLUS:

1111

case OP_MINPLUS:

1112

case OP_QUERY:

1113

case OP_MINQUERY:

1114

while ((*code & 0xc0) == 0x80) code++;

1115

break;

1116

1117

/* XCLASS is used for classes that cannot be represented just by a bit

1118

map. This includes negated single high-valued characters. The length in

1119

the table is zero; the actual length is stored in the compiled code. */

1120

1121

case OP_XCLASS:

1122

code += GET(code, 1) + 1;

1123

break;

1124

}

1125

#endif

1126

}

1127

}

1128

}

1129

1130

1131

1132

/*************************************************

1133

* Scan compiled branch for non-emptiness *

1134

*************************************************/

1135

1136

/* This function scans through a branch of a compiled pattern to see whether it

1137

can match the empty string or not. It is called only from could_be_empty()

1138

below. Note that first_significant_code() skips over assertions. If we hit an

1139

unclosed bracket, we return "empty" - this means we've struck an inner bracket

1140

whose current branch will already have been scanned.

1141

1142

Arguments:

1143

code points to start of search

1144

endcode points to where to stop

1145

utf8 TRUE if in UTF8 mode

1146

1147

Returns: TRUE if what is matched could be empty

1148

1149

1150

static BOOL

1151

could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)

1152

{

1153

1154

for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);

1155

code < endcode;

1156

code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))

1157

{

1158

const uschar *ccode;

1159

1160

c = *code;

1161

1162

if (c >= OP_BRA)

1163

{

1164

BOOL empty_branch;

1165

if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */

1166

1167

/* Scan a closed bracket */

1168

1169

empty_branch = FALSE;

1170

1171

{

1172

if (!empty_branch && could_be_empty_branch(code, endcode, utf8))

1173

empty_branch = TRUE;

1174

code += GET(code, 1);

1175

}

1176

while (*code == OP_ALT);

1177

if (!empty_branch) return FALSE; /* All branches are non-empty */

1178

code += 1 + LINK_SIZE;

1179

c = *code;

1180

}

1181

1182

else switch (c)

1183

{

1184

/* Check for quantifiers after a class */

1185

1186

#ifdef SUPPORT_UTF8

1187

case OP_XCLASS:

1188

ccode = code + GET(code, 1);

1189

goto CHECK_CLASS_REPEAT;

1190

#endif

1191

1192

case OP_CLASS:

1193

case OP_NCLASS:

1194

ccode = code + 33;

1195

1196

#ifdef SUPPORT_UTF8

1197

CHECK_CLASS_REPEAT:

1198

#endif

1199

1200

switch (*ccode)

1201

{

1202

case OP_CRSTAR: /* These could be empty; continue */

1203

case OP_CRMINSTAR:

1204

case OP_CRQUERY:

1205

case OP_CRMINQUERY:

1206

break;

1207

1208

default: /* Non-repeat => class must match */

1209

case OP_CRPLUS: /* These repeats aren't empty */

1210

case OP_CRMINPLUS:

1211

return FALSE;

1212

1213

case OP_CRRANGE:

1214

case OP_CRMINRANGE:

1215

if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */

1216

break;

1217

}

1218

break;

1219

1220

/* Opcodes that must match a character */

1221

1222

case OP_PROP:

1223

case OP_NOTPROP:

1224

case OP_EXTUNI:

1225

case OP_NOT_DIGIT:

1226

case OP_DIGIT:

1227

case OP_NOT_WHITESPACE:

1228

case OP_WHITESPACE:

1229

case OP_NOT_WORDCHAR:

1230

case OP_WORDCHAR:

1231

case OP_ANY:

1232

case OP_ANYBYTE:

1233

case OP_CHAR:

1234

case OP_CHARNC:

1235

case OP_NOT:

1236

case OP_PLUS:

1237

case OP_MINPLUS:

1238

case OP_EXACT:

1239

case OP_NOTPLUS:

1240

case OP_NOTMINPLUS:

1241

case OP_NOTEXACT:

1242

case OP_TYPEPLUS:

1243

case OP_TYPEMINPLUS:

1244

case OP_TYPEEXACT:

1245

return FALSE;

1246

1247

/* End of branch */

1248

1249

case OP_KET:

1250

case OP_KETRMAX:

1251

case OP_KETRMIN:

1252

case OP_ALT:

1253

return TRUE;

1254

1255

/* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be

1256

followed by a multibyte character */

1257

1258

#ifdef SUPPORT_UTF8

1259

case OP_STAR:

1260

case OP_MINSTAR:

1261

case OP_QUERY:

1262

case OP_MINQUERY:

1263

case OP_UPTO:

1264

case OP_MINUPTO:

1265

if (utf8) while ((code[2] & 0xc0) == 0x80) code++;

1266

break;

1267

#endif

1268

}

1269

}

1270

1271

return TRUE;

1272

}

1273

1274

1275

1276

/*************************************************

1277

* Scan compiled regex for non-emptiness *

1278

*************************************************/

1279

1280

/* This function is called to check for left recursive calls. We want to check

1281

the current branch of the current pattern to see if it could match the empty

1282

string. If it could, we must look outwards for branches at other levels,

1283

stopping when we pass beyond the bracket which is the subject of the recursion.

1284

1285

Arguments:

1286

code points to start of the recursion

1287

endcode points to where to stop (current RECURSE item)

1288

bcptr points to the chain of current (unclosed) branch starts

1289

utf8 TRUE if in UTF-8 mode

1290

1291

Returns: TRUE if what is matched could be empty

1292

1293

1294

static BOOL

1295

could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,

1296

BOOL utf8)

1297

{

1298

while (bcptr != NULL && bcptr->current >= code)

1299

{

1300

if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;

1301

bcptr = bcptr->outer;

1302

}

1303

return TRUE;

1304

}

1305

1306

1307

1308

/*************************************************

1309

* Check for POSIX class syntax *

1310

*************************************************/

1311

1312

/* This function is called when the sequence "[:" or "[." or "[=" is

1313

encountered in a character class. It checks whether this is followed by an

1314

optional ^ and then a sequence of letters, terminated by a matching ":]" or

1315

".]" or "=]".

1316

1317

Argument:

1318

ptr pointer to the initial [

1319

endptr where to return the end pointer

1320

cd pointer to compile data

1321

1322

Returns: TRUE or FALSE

1323

1324

1325

static BOOL

1326

check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)

1327

{

1328

int terminator; /* Don't combine these lines; the Solaris cc */

1329

terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */

1330

if (*(++ptr) == '^') ptr++;

1331

while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;

1332

if (*ptr == terminator && ptr[1] == ']')

1333

{

1334

*endptr = ptr;

1335

return TRUE;

1336

}

1337

return FALSE;

1338

}

1339

1340

1341

1342

1343

/*************************************************

1344

* Check POSIX class name *

1345

*************************************************/

1346

1347

/* This function is called to check the name given in a POSIX-style class entry

1348

such as [:alnum:].

1349

1350

Arguments:

1351

ptr points to the first letter

1352

len the length of the name

1353

1354

Returns: a value representing the name, or -1 if unknown

1355

1356

1357

static int

1358

check_posix_name(const uschar *ptr, int len)

1359

{

1360

1361

while (posix_name_lengths[yield] != 0)

1362

{

1363

if (len == posix_name_lengths[yield] &&

1364

strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;

1365

yield++;

1366

}

1367

return -1;

1368

}

1369

1370

1371

/*************************************************

1372

* Adjust OP_RECURSE items in repeated group *

1373

*************************************************/

1374

1375

/* OP_RECURSE items contain an offset from the start of the regex to the group

1376

that is referenced. This means that groups can be replicated for fixed

1377

repetition simply by copying (because the recursion is allowed to refer to

1378

earlier groups that are outside the current group). However, when a group is

1379

optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before

1380

it, after it has been compiled. This means that any OP_RECURSE items within it

1381

that refer to the group itself or any contained groups have to have their

1382

offsets adjusted. That is the job of this function. Before it is called, the

1383

partially compiled regex must be temporarily terminated with OP_END.

1384

1385

Arguments:

1386

group points to the start of the group

1387

adjust the amount by which the group is to be moved

1388

utf8 TRUE in UTF-8 mode

1389

cd contains pointers to tables etc.

1390

1391

Returns: nothing

1392

1393

1394

static void

1395

adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)

1396

{

1397

uschar *ptr = group;

1398

while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)

1399

{

1400

int offset = GET(ptr, 1);

1401

if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);

1402

ptr += 1 + LINK_SIZE;

1403

}

1404

}

1405

1406

1407

1408

/*************************************************

1409

* Insert an automatic callout point *

1410

*************************************************/

1411

1412

/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert

1413

callout points before each pattern item.

1414

1415

Arguments:

1416

code current code pointer

1417

ptr current pattern pointer

1418

cd pointers to tables etc

1419

1420

Returns: new code pointer

1421

1422

1423

static uschar *

1424

auto_callout(uschar *code, const uschar *ptr, compile_data *cd)

1425

{

1426

*code++ = OP_CALLOUT;

1427

*code++ = 255;

1428

PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */

1429

PUT(code, LINK_SIZE, 0); /* Default length */

1430

return code + 2*LINK_SIZE;

1431

}

1432

1433

1434

1435

/*************************************************

1436

* Complete a callout item *

1437

*************************************************/

1438

1439

/* A callout item contains the length of the next item in the pattern, which

1440

we can't fill in till after we have reached the relevant point. This is used

1441

for both automatic and manual callouts.

1442

1443

Arguments:

1444

previous_callout points to previous callout item

1445

ptr current pattern pointer

1446

cd pointers to tables etc

1447

1448

Returns: nothing

1449

1450

1451

static void

1452

complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)

1453

{

1454

int length = ptr - cd->start_pattern - GET(previous_callout, 2);

1455

PUT(previous_callout, 2 + LINK_SIZE, length);

1456

}

1457

1458

1459

1460

#ifdef SUPPORT_UCP

1461

/*************************************************

1462

* Get othercase range *

1463

*************************************************/

1464

1465

/* This function is passed the start and end of a class range, in UTF-8 mode

1466

with UCP support. It searches up the characters, looking for internal ranges of

1467

characters in the "other" case. Each call returns the next one, updating the

1468

start address.

1469

1470

Arguments:

1471

cptr points to starting character value; updated

1472

d end value

1473

ocptr where to put start of othercase range

1474

odptr where to put end of othercase range

1475

1476

Yield: TRUE when range returned; FALSE when no more

1477

1478

1479

static BOOL

1480

get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)

1481

{

1482

int c, chartype, othercase, next;

1483

1484

for (c = *cptr; c <= d; c++)

1485

{

1486

if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)

1487

break;

1488

}

1489

1490

if (c > d) return FALSE;

1491

1492

*ocptr = othercase;

1493

next = othercase + 1;

1494

1495

for (++c; c <= d; c++)

1496

{

1497

if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||

1498

othercase != next)

1499

break;

1500

next++;

1501

}

1502

1503

*odptr = next - 1;

1504

*cptr = c;

1505

1506

return TRUE;

1507

}

1508

#endif /* SUPPORT_UCP */

1509

1510

1511

/*************************************************

1512

* Compile one branch *

1513

*************************************************/

1514

1515

/* Scan the pattern, compiling it into the code vector. If the options are

1516

changed during the branch, the pointer is used to change the external options

1517

bits.

1518

1519

Arguments:

1520

optionsptr pointer to the option bits

1521

brackets points to number of extracting brackets used

1522

codeptr points to the pointer to the current code point

1523

ptrptr points to the current pattern pointer

1524

errorcodeptr points to error code variable

1525

firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)

1526

reqbyteptr set to the last literal character required, else < 0

1527

bcptr points to current branch chain

1528

cd contains pointers to tables etc.

1529

1530

Returns: TRUE on success

1531

FALSE, with *errorcodeptr set non-zero on error
*/

1532

1533

1534

static BOOL

1535

compile_branch(int *optionsptr, int *brackets, uschar **codeptr,

1536

const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,

1537

int *reqbyteptr, branch_chain *bcptr, compile_data *cd)

1538

{

1539

int repeat_type, op_type;

1540

int repeat_min = 0, repeat_max = 0; /* To please picky compilers */

1541

int bravalue = 0;

1542

int greedy_default, greedy_non_default;

1543

int firstbyte, reqbyte;

1544

int zeroreqbyte, zerofirstbyte;

1545

int req_caseopt, reqvary, tempreqvary;

1546

int condcount = 0;

1547

int options = *optionsptr;

1548

int after_manual_callout = 0;

1549

1550

1551

uschar *tempcode;

1552

BOOL inescq = FALSE;

1553

BOOL groupsetfirstbyte = FALSE;

1554

const uschar *ptr = *ptrptr;

1555

const uschar *tempptr;

1556

uschar *previous = NULL;

1557

uschar *previous_callout = NULL;

1558

uschar classbits[32];

1559

1560

#ifdef SUPPORT_UTF8

1561

BOOL class_utf8;

1562

BOOL utf8 = (options & PCRE_UTF8) != 0;

1563

uschar *class_utf8data;

1564

uschar utf8_char[6];

1565

#else

1566

BOOL utf8 = FALSE;

1567

#endif

1568

1569

/* Set up the default and non-default settings for greediness */

1570

1571

greedy_default = ((options & PCRE_UNGREEDY) != 0);

1572

greedy_non_default = greedy_default ^ 1;

1573

1574

/* Initialize no first byte, no required byte. REQ_UNSET means "no char

1575

matching encountered yet". It gets changed to REQ_NONE if we hit something that

1576

matches a non-fixed char first char; reqbyte just remains unset if we never

1577

find one.

1578

1579

When we hit a repeat whose minimum is zero, we may have to adjust these values

1580

to take the zero repeat into account. This is implemented by setting them to

1581

zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual

1582

item types that can be repeated set these backoff variables appropriately. */

1583

1584

firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;

1585

1586

/* The variable req_caseopt contains either the REQ_CASELESS value or zero,

1587

according to the current setting of the caseless flag. REQ_CASELESS is a bit

1588

value > 255. It is added into the firstbyte or reqbyte variables to record the

1589

case status of the value. This is used only for ASCII characters. */

1590

1591

req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;

1592

1593

/* Switch on next character until the end of the branch */

1594

1595

for (;; ptr++)

1596

{

1597

BOOL negate_class;

1598

BOOL possessive_quantifier;

1599

BOOL is_quantifier;

1600

int class_charcount;

1601

int class_lastchar;

1602

int newoptions;

1603

int recno;

1604

int skipbytes;

1605

int subreqbyte;

1606

int subfirstbyte;

1607

int mclength;

1608

uschar mcbuffer[8];

1609

1610

/* Next byte in the pattern */

1611

1612

c = *ptr;

1613

1614

/* If in \Q...\E, check for the end; if not, we have a literal */

1615

1616

if (inescq && c != 0)

1617

{

1618

if (c == '\\' && ptr[1] == 'E')

1619

{

1620

inescq = FALSE;

1621

ptr++;

1622

continue;

1623

}

1624

else

1625

{

1626

if (previous_callout != NULL)

1627

{

1628

complete_callout(previous_callout, ptr, cd);

1629

previous_callout = NULL;

1630

}

1631

if ((options & PCRE_AUTO_CALLOUT) != 0)

1632

{

1633

previous_callout = code;

1634

code = auto_callout(code, ptr, cd);

1635

}

1636

goto NORMAL_CHAR;

1637

}

1638

}

1639

1640

/* Fill in length of a previous callout, except when the next thing is

1641

a quantifier. */

1642

1643

is_quantifier = c == '*' || c == '+' || c == '?' ||

1644

(c == '{' && is_counted_repeat(ptr+1));

1645

1646

if (!is_quantifier && previous_callout != NULL &&

1647

after_manual_callout-- <= 0)

1648

{

1649

complete_callout(previous_callout, ptr, cd);

1650

previous_callout = NULL;

1651

}

1652

1653

/* In extended mode, skip white space and comments */

1654

1655

if ((options & PCRE_EXTENDED) != 0)

1656

{

1657

if ((cd->ctypes[c] & ctype_space) != 0) continue;

1658

if (c == '#')

1659

{

1660

/* The space before the ; is to avoid a warning on a silly compiler

1661

on the Macintosh. */

1662

while ((c = *(++ptr)) != 0 && c != NEWLINE) ;

1663

if (c != 0) continue; /* Else fall through to handle end of string */

1664

}

1665

}

1666

1667

/* No auto callout for quantifiers. */

1668

1669

if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)

1670

{

1671

previous_callout = code;

1672

code = auto_callout(code, ptr, cd);

1673

}

1674

1675

switch(c)

1676

{

1677

/* The branch terminates at end of string, |, or ). */

1678

1679

case 0:

1680

case '|':

1681

case ')':

1682

*firstbyteptr = firstbyte;

1683

*reqbyteptr = reqbyte;

1684

*codeptr = code;

1685

*ptrptr = ptr;

1686

return TRUE;

1687

1688

/* Handle single-character metacharacters. In multiline mode, ^ disables

1689

the setting of any following char as a first character. */

1690

1691

case '^':

1692

if ((options & PCRE_MULTILINE) != 0)

1693

{

1694

if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;

1695

}

1696

previous = NULL;

1697

*code++ = OP_CIRC;

1698

break;

1699

1700

case '$':

1701

previous = NULL;

1702

*code++ = OP_DOLL;

1703

break;

1704

1705

/* There can never be a first char if '.' is first, whatever happens about

1706

repeats. The value of reqbyte doesn't change either. */

1707

1708

case '.':

1709

if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;

1710

zerofirstbyte = firstbyte;

1711

zeroreqbyte = reqbyte;

1712

previous = code;

1713

*code++ = OP_ANY;

1714

break;

1715

1716

/* Character classes. If the included characters are all < 255 in value, we

1717

build a 32-byte bitmap of the permitted characters, except in the special

1718

case where there is only one such character. For negated classes, we build

1719

the map as usual, then invert it at the end. However, we use a different

1720

opcode so that data characters > 255 can be handled correctly.

1721

1722

If the class contains characters outside the 0-255 range, a different

1723

opcode is compiled. It may optionally have a bit map for characters < 256,

1724

but those above are explicitly listed afterwards. A flag byte tells

1725

whether the bitmap is present, and whether this is a negated class or not.
*/

1726

1727

1728

case '[':

1729

previous = code;

1730

1731

/* PCRE supports POSIX class stuff inside a class. Perl gives an error if

1732

they are encountered at the top level, so we'll do that too. */

1733

1734

if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&

1735

check_posix_syntax(ptr, &tempptr, cd))

1736

{

1737

*errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;

1738

goto FAILED;

1739

}

1740

1741

/* If the first character is '^', set the negation flag and skip it. */

1742

1743

if ((c = *(++ptr)) == '^')

1744

{

1745

negate_class = TRUE;

1746

c = *(++ptr);

1747

}

1748

else

1749

{

1750

negate_class = FALSE;

1751

}

1752

1753

/* Keep a count of chars with values < 256 so that we can optimize the case

1754

of just a single character (as long as it's < 256). For higher valued UTF-8

1755

characters, we don't yet do any optimization. */

1756

1757

class_charcount = 0;

1758

class_lastchar = -1;

1759

1760

#ifdef SUPPORT_UTF8

1761

class_utf8 = FALSE; /* No chars >= 256 */

1762

class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */

1763

#endif

1764

1765

/* Initialize the 32-char bit map to all zeros. We have to build the

1766

map in a temporary bit of store, in case the class contains only 1

1767

character (< 256), because in that case the compiled code doesn't use the

1768

bit map. */

1769

1770

memset(classbits, 0, 32 * sizeof(uschar));

1771

1772

/* Process characters until ] is reached. By writing this as a "do" it

1773

means that an initial ] is taken as a data character. The first pass

1774

through the regex checked the overall syntax, so we don't need to be very

1775

strict here. At the start of the loop, c contains the first byte of the

1776

character. */

1777

1778

1779

{

1780

#ifdef SUPPORT_UTF8

1781

if (utf8 && c > 127)

1782

{ /* Braces are required because the */

1783

GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */

1784

}

1785

#endif

1786

1787

/* Inside \Q...\E everything is literal except \E */

1788

1789

if (inescq)

1790

{

1791

if (c == '\\' && ptr[1] == 'E')

1792

{

1793

inescq = FALSE;

1794

ptr++;

1795

continue;

1796

}

1797

else goto LONE_SINGLE_CHARACTER;

1798

}

1799

1800

/* Handle POSIX class names. Perl allows a negation extension of the

1801

form [:^name:]. A square bracket that doesn't match the syntax is

1802

treated as a literal. We also recognize the POSIX constructions

1803

[.ch.] and [=ch=] ("collating elements") and fault them, as Perl

1804

5.6 and 5.8 do. */

1805

1806

if (c == '[' &&

1807

(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&

1808

check_posix_syntax(ptr, &tempptr, cd))

1809

{

1810

BOOL local_negate = FALSE;

1811

int posix_class, i;

1812

1813

1814

if (ptr[1] != ':')

1815

{

1816

*errorcodeptr = ERR31;

1817

goto FAILED;

1818

}

1819

1820

ptr += 2;

1821

if (*ptr == '^')

1822

{

1823

local_negate = TRUE;

1824

ptr++;

1825

}

1826

1827

posix_class = check_posix_name(ptr, tempptr - ptr);

1828

if (posix_class < 0)

1829

{

1830

*errorcodeptr = ERR30;

1831

goto FAILED;

1832

}

1833

1834

/* If matching is caseless, upper and lower are converted to

1835

alpha. This relies on the fact that the class table starts with

1836

alpha, lower, upper as the first 3 entries. */

1837

1838

if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)

1839

posix_class = 0;

1840

1841

/* Or into the map we are building up to 3 of the static class

1842

tables, or their negations. The [:blank:] class sets up the same

1843

chars as the [:space:] class (all white space). We remove the vertical

1844

white space chars afterwards. */

1845

1846

posix_class *= 3;

1847

for (i = 0; i < 3; i++)

1848

{

1849

BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;

1850

int taboffset = posix_class_maps[posix_class + i];

1851

if (taboffset < 0) break;

1852

if (local_negate)

1853

{

1854

if (i == 0)

1855

for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];

1856

else

1857

for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];

1858

if (blankclass) classbits[1] |= 0x3c;

1859

}

1860

else

1861

{

1862

for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];

1863

if (blankclass) classbits[1] &= ~0x3c;

1864

}

1865

}

1866

1867

ptr = tempptr + 1;

1868

class_charcount = 10; /* Set > 1; assumes more than 1 per class */

1869

continue; /* End of POSIX syntax handling */

1870

}

1871

1872

/* Backslash may introduce a single character, or it may introduce one

1873

of the specials, which just set a flag. Escaped items are checked for

1874

validity in the pre-compiling pass. The sequence \b is a special case.

1875

Inside a class (and only there) it is treated as backspace. Elsewhere

1876

it marks a word boundary. Other escapes have preset maps ready to

1877

or into the one we are building. We assume they have more than one

1878

character in them, so set class_charcount bigger than one. */

1879

1880

if (c == '\\')

1881

{

1882

c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);

1883

1884

if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */

1885

else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */

1886

else if (-c == ESC_Q) /* Handle start of quoted string */

1887

{

1888

if (ptr[1] == '\\' && ptr[2] == 'E')

1889

{

1890

ptr += 2; /* avoid empty string */

1891

}

1892

else inescq = TRUE;

1893

continue;

1894

}

1895

1896

if (c < 0)

1897

{

1898

1899

class_charcount += 2; /* Greater than 1 is what matters */

1900

switch (-c)

1901

{

1902

case ESC_d:

1903

for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];

1904

continue;

1905

1906

case ESC_D:

1907

for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];

1908

continue;

1909

1910

case ESC_w:

1911

for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];

1912

continue;

1913

1914

case ESC_W:

1915

for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];

1916

continue;

1917

1918

case ESC_s:

1919

for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];

1920

classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */

1921

continue;

1922

1923

case ESC_S:

1924

for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];

1925

classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */

1926

continue;

1927

1928

#ifdef SUPPORT_UCP

1929

case ESC_p:

1930

case ESC_P:

1931

{

1932

BOOL negated;

1933

int property = get_ucp(&ptr, &negated, errorcodeptr);

1934

if (property < 0) goto FAILED;

1935

class_utf8 = TRUE;

1936

*class_utf8data++ = ((-c == ESC_p) != negated)?

1937

XCL_PROP : XCL_NOTPROP;

1938

*class_utf8data++ = property;

1939

class_charcount -= 2; /* Not a < 256 character */

1940

}

1941

continue;

1942

#endif

1943

1944

/* Unrecognized escapes are faulted if PCRE is running in its

1945

strict mode. By default, for compatibility with Perl, they are

1946

treated as literals. */

1947

1948

default:

1949

if ((options & PCRE_EXTRA) != 0)

1950

{

1951

*errorcodeptr = ERR7;

1952

goto FAILED;

1953

}

1954

c = *ptr; /* The final character */

1955

class_charcount -= 2; /* Undo the default count from above */

1956

}

1957

}

1958

1959

/* Fall through if we have a single character (c >= 0). This may be

1960

> 256 in UTF-8 mode. */

1961

1962

} /* End of backslash handling */

1963

1964

/* A single character may be followed by '-' to form a range. However,

1965

Perl does not permit ']' to be the end of the range. A '-' character

1966

here is treated as a literal. */

1967

1968

if (ptr[1] == '-' && ptr[2] != ']')

1969

{

1970

int d;

1971

ptr += 2;

1972

1973

#ifdef SUPPORT_UTF8

1974

if (utf8)

1975

{ /* Braces are required because the */

1976

GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */

1977

}

1978

else

1979

#endif

1980

d = *ptr; /* Not UTF-8 mode */

1981

1982

/* The second part of a range can be a single-character escape, but

1983

not any of the other escapes. Perl 5.6 treats a hyphen as a literal

1984

in such circumstances. */

1985

1986

if (d == '\\')

1987

{

1988

const uschar *oldptr = ptr;

1989

d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);

1990

1991

/* \b is backspace; \X is literal X; any other special means the '-'

1992

was literal */

1993

1994

if (d < 0)

1995

{

1996

if (d == -ESC_b) d = '\b';

1997

else if (d == -ESC_X) d = 'X'; else

1998

{

1999

ptr = oldptr - 2;

2000

goto LONE_SINGLE_CHARACTER; /* A few lines below */

2001

}

2002

}

2003

}

2004

2005

/* The check that the two values are in the correct order happens in

2006

the pre-pass. Optimize one-character ranges */

2007

2008

if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */

2009

2010

/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless

2011

matching, we have to use an XCLASS with extra data items. Caseless

2012

matching for characters > 127 is available only if UCP support is

2013

available. */

2014

2015

#ifdef SUPPORT_UTF8

2016

if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))

2017

{

2018

class_utf8 = TRUE;

2019

2020

/* With UCP support, we can find the other case equivalents of

2021

the relevant characters. There may be several ranges. Optimize how

2022

they fit with the basic range. */

2023

2024

#ifdef SUPPORT_UCP

2025

if ((options & PCRE_CASELESS) != 0)

2026

{

2027

int occ, ocd;

2028

int cc = c;

2029

int origd = d;

2030

while (get_othercase_range(&cc, origd, &occ, &ocd))

2031

{

2032

if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */

2033

2034

if (occ < c && ocd >= c - 1) /* Extend the basic range */

2035

{ /* if there is overlap, */

2036

c = occ; /* noting that if occ < c */

2037

continue; /* we can't have ocd > d */

2038

} /* because a subrange is */

2039

if (ocd > d && occ <= d + 1) /* always shorter than */

2040

{ /* the basic range. */

2041

d = ocd;

2042

continue;

2043

}

2044

2045

if (occ == ocd)

2046

{

2047

*class_utf8data++ = XCL_SINGLE;

2048

}

2049

else

2050

{

2051

*class_utf8data++ = XCL_RANGE;

2052

class_utf8data += _pcre_ord2utf8(occ, class_utf8data);

2053

}

2054

class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);

2055

}

2056

}

2057

#endif /* SUPPORT_UCP */

2058

2059

/* Now record the original range, possibly modified for UCP caseless

2060

overlapping ranges. */

2061

2062

*class_utf8data++ = XCL_RANGE;

2063

class_utf8data += _pcre_ord2utf8(c, class_utf8data);

2064

class_utf8data += _pcre_ord2utf8(d, class_utf8data);

2065

2066

/* With UCP support, we are done. Without UCP support, there is no

2067

caseless matching for UTF-8 characters > 127; we can use the bit map

2068

for the smaller ones. */

2069

2070

#ifdef SUPPORT_UCP

2071

continue; /* With next character in the class */

2072

#else

2073

if ((options & PCRE_CASELESS) == 0 || c > 127) continue;

2074

2075

/* Adjust upper limit and fall through to set up the map */

2076

2077

d = 127;

2078

2079

#endif /* SUPPORT_UCP */

2080

}

2081

#endif /* SUPPORT_UTF8 */

2082

2083

/* We use the bit map for all cases when not in UTF-8 mode; else

2084

ranges that lie entirely within 0-127 when there is UCP support; else

2085

for partial ranges without UCP support. */

2086

2087

for (; c <= d; c++)

2088

{

2089

classbits[c/8] |= (1 << (c&7));

2090

if ((options & PCRE_CASELESS) != 0)

2091

{

2092

int uc = cd->fcc[c]; /* flip case */

2093

classbits[uc/8] |= (1 << (uc&7));

2094

}

2095

class_charcount++; /* in case a one-char range */

2096

class_lastchar = c;

2097

}

2098

2099

continue; /* Go get the next char in the class */

2100

}

2101

2102

/* Handle a lone single character - we can get here for a normal

2103

non-escape char, or after \ that introduces a single character or for an

2104

apparent range that isn't. */

2105

2106

LONE_SINGLE_CHARACTER:

2107

2108

/* Handle a character that cannot go in the bit map */

2109

2110

#ifdef SUPPORT_UTF8

2111

if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))

2112

{

2113

class_utf8 = TRUE;

2114

*class_utf8data++ = XCL_SINGLE;

2115

class_utf8data += _pcre_ord2utf8(c, class_utf8data);

2116

2117

#ifdef SUPPORT_UCP

2118

if ((options & PCRE_CASELESS) != 0)

2119

{

2120

int chartype;

2121

int othercase;

2122

if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&

2123

othercase > 0)

2124

{

2125

*class_utf8data++ = XCL_SINGLE;

2126

class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);

2127

}

2128

}

2129

#endif /* SUPPORT_UCP */

2130

2131

}

2132

else

2133

#endif /* SUPPORT_UTF8 */

2134

2135

/* Handle a single-byte character */

2136

{

2137

classbits[c/8] |= (1 << (c&7));

2138

if ((options & PCRE_CASELESS) != 0)

2139

{

2140

c = cd->fcc[c]; /* flip case */

2141

classbits[c/8] |= (1 << (c&7));

2142

}

2143

class_charcount++;

2144

class_lastchar = c;

2145

}

2146

}

2147

2148

/* Loop until ']' reached; the check for end of string happens inside the

2149

loop. This "while" is the end of the "do" above. */

2150

2151

while ((c = *(++ptr)) != ']' || inescq);

2152

2153

/* If class_charcount is 1, we saw precisely one character whose value is

2154

less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we

2155

can optimize the negative case only if there were no characters >= 128

2156

because OP_NOT and the related opcodes like OP_NOTSTAR operate on

2157

single-bytes only. This is an historical hangover. Maybe one day we can

2158

tidy these opcodes to handle multi-byte characters.

2159

2160

The optimization throws away the bit map. We turn the item into a

2161

1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note

2162

that OP_NOT does not support multibyte characters. In the positive case, it

2163

can cause firstbyte to be set. Otherwise, there can be no first char if

2164

this item is first, whatever repeat count may follow. In the case of

2165

reqbyte, save the previous value for reinstating. */

2166

2167

#ifdef SUPPORT_UTF8

2168

if (class_charcount == 1 &&

2169

(!utf8 ||

2170

(!class_utf8 && (!negate_class || class_lastchar < 128))))

2171

2172

#else

2173

if (class_charcount == 1)

2174

#endif

2175

{

2176

zeroreqbyte = reqbyte;

2177

2178

/* The OP_NOT opcode works on one-byte characters only. */

2179

2180

if (negate_class)

2181

{

2182

if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;

2183

zerofirstbyte = firstbyte;

2184

*code++ = OP_NOT;

2185

*code++ = class_lastchar;

2186

break;

2187

}

2188

2189

/* For a single, positive character, get the value into mcbuffer, and

2190

then we can handle this with the normal one-character code. */

2191

2192

#ifdef SUPPORT_UTF8

2193

if (utf8 && class_lastchar > 127)

2194

mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);

2195

else

2196

#endif

2197

{

2198

mcbuffer[0] = class_lastchar;

2199

mclength = 1;

2200

}

2201

goto ONE_CHAR;

2202

} /* End of 1-char optimization */

2203

2204

/* The general case - not the one-char optimization. If this is the first

2205

thing in the branch, there can be no first char setting, whatever the

2206

repeat count. Any reqbyte setting must remain unchanged after any kind of

2207

repeat. */

2208

2209

if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;

2210

zerofirstbyte = firstbyte;

2211

zeroreqbyte = reqbyte;

2212

2213

/* If there are characters with values > 255, we have to compile an

2214

extended class, with its own opcode. If there are no characters < 256,

2215

we can omit the bitmap. */

2216

2217

#ifdef SUPPORT_UTF8

2218

if (class_utf8)

2219

{

2220

*class_utf8data++ = XCL_END; /* Marks the end of extra data */

2221

*code++ = OP_XCLASS;

2222

code += LINK_SIZE;

2223

*code = negate_class? XCL_NOT : 0;

2224

2225

/* If the map is required, install it, and move on to the end of

2226

the extra data */

2227

2228

if (class_charcount > 0)

2229

{

2230

*code++ |= XCL_MAP;

2231

memcpy(code, classbits, 32);

2232

code = class_utf8data;

2233

}

2234

2235

/* If the map is not required, slide down the extra data. */

2236

2237

else

2238

{

2239

int len = class_utf8data - (code + 33);

2240

memmove(code + 1, code + 33, len);

2241

code += len + 1;

2242

}

2243

2244

/* Now fill in the complete length of the item */

2245

2246

PUT(previous, 1, code - previous);

2247

break; /* End of class handling */

2248

}

2249

#endif

2250

2251

/* If there are no characters > 255, negate the 32-byte map if necessary,

2252

and copy it into the code vector. If this is the first thing in the branch,

2253

there can be no first char setting, whatever the repeat count. Any reqbyte

2254

setting must remain unchanged after any kind of repeat. */

2255

2256

if (negate_class)

2257

{

2258

*code++ = OP_NCLASS;

2259

for (c = 0; c < 32; c++) code[c] = ~classbits[c];

2260

}

2261

else

2262

{

2263

*code++ = OP_CLASS;

2264

memcpy(code, classbits, 32);

2265

}

2266

code += 32;

2267

break;

2268

2269

/* Various kinds of repeat; '{' is not necessarily a quantifier, but this

2270

has been tested above. */

2271

2272

case '{':

2273

if (!is_quantifier) goto NORMAL_CHAR;

2274

ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);

2275

if (*errorcodeptr != 0) goto FAILED;

2276

goto REPEAT;

2277

2278

case '*':

2279

repeat_min = 0;

2280

repeat_max = -1;

2281

goto REPEAT;

2282

2283

case '+':

2284

repeat_min = 1;

2285

repeat_max = -1;

2286

goto REPEAT;

2287

2288

case '?':

2289

repeat_min = 0;

2290

repeat_max = 1;

2291

2292

REPEAT:

2293

if (previous == NULL)

2294

{

2295

*errorcodeptr = ERR9;

2296

goto FAILED;

2297

}

2298

2299

if (repeat_min == 0)

2300

{

2301

firstbyte = zerofirstbyte; /* Adjust for zero repeat */

2302

reqbyte = zeroreqbyte; /* Ditto */

2303

}

2304

2305

/* Remember whether this is a variable length repeat */

2306

2307

reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;

2308

2309

op_type = 0; /* Default single-char op codes */

2310

possessive_quantifier = FALSE; /* Default not possessive quantifier */

2311

2312

/* Save start of previous item, in case we have to move it up to make space

2313

for an inserted OP_ONCE for the additional '+' extension. */

2314

2315

tempcode = previous;

2316

2317

/* If the next character is '+', we have a possessive quantifier. This

2318

implies greediness, whatever the setting of the PCRE_UNGREEDY option.

2319

If the next character is '?' this is a minimizing repeat, by default,

2320

but if PCRE_UNGREEDY is set, it works the other way round. We change the

2321

repeat type to the non-default. */

2322

2323

if (ptr[1] == '+')

2324

{

2325

repeat_type = 0; /* Force greedy */

2326

possessive_quantifier = TRUE;

2327

ptr++;

2328

}

2329

else if (ptr[1] == '?')

2330

{

2331

repeat_type = greedy_non_default;

2332

ptr++;

2333

}

2334

else repeat_type = greedy_default;

2335

2336

/* If previous was a recursion, we need to wrap it inside brackets so that

2337

it can be replicated if necessary. */

2338

2339

if (*previous == OP_RECURSE)

2340

{

2341

memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);

2342

code += 1 + LINK_SIZE;

2343

*previous = OP_BRA;

2344

PUT(previous, 1, code - previous);

2345

*code = OP_KET;

2346

PUT(code, 1, code - previous);

2347

code += 1 + LINK_SIZE;

2348

}

2349

2350

/* If previous was a character match, abolish the item and generate a

2351

repeat item instead. If a char item has a minimum of more than one, ensure

2352

that it is set in reqbyte - it might not be if a sequence such as x{3} is

2353

the first thing in a branch because the x will have gone into firstbyte

2354

instead. */

2355

2356

if (*previous == OP_CHAR || *previous == OP_CHARNC)

2357

{

2358

/* Deal with UTF-8 characters that take up more than one byte. It's

2359

easier to write this out separately than try to macrify it. Use c to

2360

hold the length of the character in bytes, plus 0x80 to flag that it's a

2361

length rather than a small character. */

2362

2363

#ifdef SUPPORT_UTF8

2364

if (utf8 && (code[-1] & 0x80) != 0)

2365

{

2366

uschar *lastchar = code - 1;

2367

while((*lastchar & 0xc0) == 0x80) lastchar--;

2368

c = code - lastchar; /* Length of UTF-8 character */

2369

memcpy(utf8_char, lastchar, c); /* Save the char */

2370

c |= 0x80; /* Flag c as a length */

2371

}

2372

else

2373

#endif

2374

2375

/* Handle the case of a single byte - either with no UTF8 support, or

2376

with UTF-8 disabled, or for a UTF-8 character < 128. */

2377

2378

{

2379

c = code[-1];

2380

if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;

2381

}

2382

2383

goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */

2384

}

2385

2386

/* If previous was a single negated character ([^a] or similar), we use

2387

one of the special opcodes, replacing it. The code is shared with single-

2388

character repeats by setting op_type to add a suitable offset into

2389

repeat_type. OP_NOT is currently used only for single-byte chars. */

2390

2391

else if (*previous == OP_NOT)

2392

{

2393

op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */

2394

c = previous[1];

2395

goto OUTPUT_SINGLE_REPEAT;

2396

}

2397

2398

/* If previous was a character type match (\d or similar), abolish it and

2399

create a suitable repeat item. The code is shared with single-character

2400

repeats by setting op_type to add a suitable offset into repeat_type. Note

2401

that the Unicode property types will be present only when SUPPORT_UCP is

2402

defined, but we don't wrap the little bits of code here because it just

2403

makes it horribly messy. */

2404

2405

else if (*previous < OP_EODN)

2406

{

2407

uschar *oldcode;

2408

int prop_type;

2409

op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */

2410

c = *previous;

2411

2412

OUTPUT_SINGLE_REPEAT:

2413

prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?

2414

previous[1] : -1;

2415

2416

oldcode = code;

2417

code = previous; /* Usually overwrite previous item */

2418

2419

/* If the maximum is zero then the minimum must also be zero; Perl allows

2420

this case, so we do too - by simply omitting the item altogether. */

2421

2422

if (repeat_max == 0) goto END_REPEAT;

2423

2424

/* All real repeats make it impossible to handle partial matching (maybe

2425

one day we will be able to remove this restriction). */

2426

2427

if (repeat_max != 1) cd->nopartial = TRUE;

2428

2429

/* Combine the op_type with the repeat_type */

2430

2431

repeat_type += op_type;

2432

2433

/* A minimum of zero is handled either as the special case * or ?, or as

2434

an UPTO, with the maximum given. */

2435

2436

if (repeat_min == 0)

2437

{

2438

if (repeat_max == -1) *code++ = OP_STAR + repeat_type;

2439

else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;

2440

else

2441

{

2442

*code++ = OP_UPTO + repeat_type;

2443

PUT2INC(code, 0, repeat_max);

2444

}

2445

}

2446

2447

/* A repeat minimum of 1 is optimized into some special cases. If the

2448

maximum is unlimited, we use OP_PLUS. Otherwise, the original item is

2449

left in place and, if the maximum is greater than 1, we use OP_UPTO with

2450

one less than the maximum. */

2451

2452

else if (repeat_min == 1)

2453

{

2454

if (repeat_max == -1)

2455

*code++ = OP_PLUS + repeat_type;

2456

else

2457

{

2458

code = oldcode; /* leave previous item in place */

2459

if (repeat_max == 1) goto END_REPEAT;

2460

*code++ = OP_UPTO + repeat_type;

2461

PUT2INC(code, 0, repeat_max - 1);

2462

}

2463

}

2464

2465

/* The case {n,n} is just an EXACT, while the general case {n,m} is

2466

handled as an EXACT followed by an UPTO. */

2467

2468

else

2469

{

2470

*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */

2471

PUT2INC(code, 0, repeat_min);

2472

2473

/* If the maximum is unlimited, insert an OP_STAR. Before doing so,

2474

we have to insert the character for the previous code. For a repeated

2475

Unicode property match, there is an extra byte that defines the

2476

required property. In UTF-8 mode, long characters have their length in

2477

c, with the 0x80 bit as a flag. */

2478

2479

if (repeat_max < 0)

2480

{

2481

#ifdef SUPPORT_UTF8

2482

if (utf8 && c >= 128)

2483

{

2484

memcpy(code, utf8_char, c & 7);

2485

code += c & 7;

2486

}

2487

else

2488

#endif

2489

{

2490

*code++ = c;

2491

if (prop_type >= 0) *code++ = prop_type;

2492

}

2493

*code++ = OP_STAR + repeat_type;

2494

}

2495

2496

/* Else insert an UPTO if the max is greater than the min, again

2497

preceded by the character, for the previously inserted code. */

2498

2499

else if (repeat_max != repeat_min)

2500

{

2501

#ifdef SUPPORT_UTF8

2502

if (utf8 && c >= 128)

2503

{

2504

memcpy(code, utf8_char, c & 7);

2505

code += c & 7;

2506

}

2507

else

2508

#endif

2509

*code++ = c;

2510

if (prop_type >= 0) *code++ = prop_type;

2511

repeat_max -= repeat_min;

2512

*code++ = OP_UPTO + repeat_type;

2513

PUT2INC(code, 0, repeat_max);

2514

}

2515

}

2516

2517

/* The character or character type itself comes last in all cases. */

2518

2519

#ifdef SUPPORT_UTF8

2520

if (utf8 && c >= 128)

2521

{

2522

memcpy(code, utf8_char, c & 7);

2523

code += c & 7;

2524

}

2525

else

2526

#endif

2527

*code++ = c;

2528

2529

/* For a repeated Unicode property match, there is an extra byte that

2530

defines the required property. */

2531

2532

#ifdef SUPPORT_UCP

2533

if (prop_type >= 0) *code++ = prop_type;

2534

#endif

2535

}

2536

2537

/* If previous was a character class or a back reference, we put the repeat

2538

stuff after it, but just skip the item if the repeat was {0,0}. */

2539

2540

else if (*previous == OP_CLASS ||

2541

*previous == OP_NCLASS ||

2542

#ifdef SUPPORT_UTF8

2543

*previous == OP_XCLASS ||

2544

#endif

2545

*previous == OP_REF)

2546

{

2547

if (repeat_max == 0)

2548

{

2549

code = previous;

2550

goto END_REPEAT;

2551

}

2552

2553

/* All real repeats make it impossible to handle partial matching (maybe

2554

one day we will be able to remove this restriction). */

2555

2556

if (repeat_max != 1) cd->nopartial = TRUE;

2557

2558

if (repeat_min == 0 && repeat_max == -1)

2559

*code++ = OP_CRSTAR + repeat_type;

2560

else if (repeat_min == 1 && repeat_max == -1)

2561

*code++ = OP_CRPLUS + repeat_type;

2562

else if (repeat_min == 0 && repeat_max == 1)

2563

*code++ = OP_CRQUERY + repeat_type;

2564

else

2565

{

2566

*code++ = OP_CRRANGE + repeat_type;

2567

PUT2INC(code, 0, repeat_min);

2568

if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */

2569

PUT2INC(code, 0, repeat_max);

2570

}

2571

}

2572

2573

/* If previous was a bracket group, we may have to replicate it in certain

2574

cases. */

2575

2576

else if (*previous >= OP_BRA || *previous == OP_ONCE ||

2577

*previous == OP_COND)

2578

{

2579

2580

int ketoffset = 0;

2581

int len = code - previous;

2582

uschar *bralink = NULL;

2583

2584

/* If the maximum repeat count is unlimited, find the end of the bracket

2585

by scanning through from the start, and compute the offset back to it

2586

from the current code pointer. There may be an OP_OPT setting following

2587

the final KET, so we can't find the end just by going back from the code

2588

pointer. */

2589

2590

if (repeat_max == -1)

2591

{

2592

2593

do ket += GET(ket, 1); while (*ket != OP_KET);

2594

ketoffset = code - ket;

2595

}

2596

2597

/* The case of a zero minimum is special because of the need to stick

2598

OP_BRAZERO in front of it, and because the group appears once in the

2599

data, whereas in other cases it appears the minimum number of times. For

2600

this reason, it is simplest to treat this case separately, as otherwise

2601

the code gets far too messy. There are several special subcases when the

2602

minimum is zero. */

2603

2604

if (repeat_min == 0)

2605

{

2606

/* If the maximum is also zero, we just omit the group from the output

2607

altogether. */

2608

2609

if (repeat_max == 0)

2610

{

2611

code = previous;

2612

goto END_REPEAT;

2613

}

2614

2615

/* If the maximum is 1 or unlimited, we just have to stick in the

2616

BRAZERO and do no more at this point. However, we do need to adjust

2617

any OP_RECURSE calls inside the group that refer to the group itself or

2618

any internal group, because the offset is from the start of the whole

2619

regex. Temporarily terminate the pattern while doing this. */

2620

2621

if (repeat_max <= 1)

2622

{

2623

*code = OP_END;

2624

adjust_recurse(previous, 1, utf8, cd);

2625

memmove(previous+1, previous, len);

2626

code++;

2627

*previous++ = OP_BRAZERO + repeat_type;

2628

}

2629

2630

/* If the maximum is greater than 1 and limited, we have to replicate

2631

in a nested fashion, sticking OP_BRAZERO before each set of brackets.

2632

The first one has to be handled carefully because it's the original

2633

copy, which has to be moved up. The remainder can be handled by code

2634

that is common with the non-zero minimum case below. We have to

2635

adjust the value of repeat_max, since one less copy is required. Once

2636

again, we may have to adjust any OP_RECURSE calls inside the group. */

2637

2638

else

2639

{

2640

int offset;

2641

*code = OP_END;

2642

adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);

2643

memmove(previous + 2 + LINK_SIZE, previous, len);

2644

code += 2 + LINK_SIZE;

2645

*previous++ = OP_BRAZERO + repeat_type;

2646

*previous++ = OP_BRA;

2647

2648

/* We chain together the bracket offset fields that have to be

2649

filled in later when the ends of the brackets are reached. */

2650

2651

offset = (bralink == NULL)? 0 : previous - bralink;

2652

bralink = previous;

2653

PUTINC(previous, 0, offset);

2654

}

2655

2656

repeat_max--;

2657

}

2658

2659

/* If the minimum is greater than zero, replicate the group as many

2660

times as necessary, and adjust the maximum to the number of subsequent

2661

copies that we need. If we set a first char from the group, and didn't

2662

set a required char, copy the latter from the former. */

2663

2664

else

2665

{

2666

if (repeat_min > 1)

2667

{

2668

if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;

2669

for (i = 1; i < repeat_min; i++)

2670

{

2671

memcpy(code, previous, len);

2672

code += len;

2673

}

2674

}

2675

if (repeat_max > 0) repeat_max -= repeat_min;

2676

}

2677

2678

/* This code is common to both the zero and non-zero minimum cases. If

2679

the maximum is limited, it replicates the group in a nested fashion,

2680

remembering the bracket starts on a stack. In the case of a zero minimum,

2681

the first one was set up above. In all cases the repeat_max now specifies

2682

the number of additional copies needed. */

2683

2684

if (repeat_max >= 0)

2685

{

2686

for (i = repeat_max - 1; i >= 0; i--)

2687

{

2688

*code++ = OP_BRAZERO + repeat_type;

2689

2690

/* All but the final copy start a new nesting, maintaining the

2691

chain of brackets outstanding. */

2692

2693

if (i != 0)

2694

{

2695

int offset;

2696

*code++ = OP_BRA;

2697

offset = (bralink == NULL)? 0 : code - bralink;

2698

bralink = code;

2699

PUTINC(code, 0, offset);

2700

}

2701

2702

memcpy(code, previous, len);

2703

code += len;

2704

}

2705

2706

/* Now chain through the pending brackets, and fill in their length

2707

fields (which are holding the chain links pro tem). */

2708

2709

while (bralink != NULL)

2710

{

2711

int oldlinkoffset;

2712

int offset = code - bralink + 1;

2713

uschar *bra = code - offset;

2714

oldlinkoffset = GET(bra, 1);

2715

bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;

2716

*code++ = OP_KET;

2717

PUTINC(code, 0, offset);

2718

PUT(bra, 1, offset);

2719

}

2720

}

2721

2722

/* If the maximum is unlimited, set a repeater in the final copy. We

2723

can't just offset backwards from the current code point, because we

2724

don't know if there's been an options resetting after the ket. The

2725

correct offset was computed above. */

2726

2727

else code[-ketoffset] = OP_KETRMAX + repeat_type;

2728

}

2729

2730

/* Else there's some kind of shambles */

2731

2732

else

2733

{

2734

*errorcodeptr = ERR11;

2735

goto FAILED;

2736

}

2737

2738

/* If the character following a repeat is '+', we wrap the entire repeated

2739

item inside OP_ONCE brackets. This is just syntactic sugar, taken from

2740

Sun's Java package. The repeated item starts at tempcode, not at previous,

2741

which might be the first part of a string whose (former) last char we

2742

repeated. However, we don't support '+' after a greediness '?'. */

2743

2744

if (possessive_quantifier)

2745

{

2746

int len = code - tempcode;

2747

memmove(tempcode + 1+LINK_SIZE, tempcode, len);

2748

code += 1 + LINK_SIZE;

2749

len += 1 + LINK_SIZE;

2750

tempcode[0] = OP_ONCE;

2751

*code++ = OP_KET;

2752

PUTINC(code, 0, len);

2753

PUT(tempcode, 1, len);

2754

}

2755

2756

/* In all cases we no longer have a previous item. We also set the

2757

"follows varying string" flag for subsequently encountered reqbytes if

2758

it isn't already set and we have just passed a varying length item. */

2759

2760

END_REPEAT:

2761

previous = NULL;

2762

cd->req_varyopt |= reqvary;

2763

break;

2764

2765

2766

/* Start of nested bracket sub-expression, or comment or lookahead or

2767

lookbehind or option setting or condition. First deal with special things

2768

that can come after a bracket; all are introduced by ?, and the appearance

2769

of any of them means that this is not a referencing group. They were

2770

checked for validity in the first pass over the string, so we don't have to

2771

check for syntax errors here. */

2772

2773

case '(':

2774

newoptions = options;

2775

skipbytes = 0;

2776

2777

if (*(++ptr) == '?')

2778

{

2779

int set, unset;

2780

int *optset;

2781

2782

switch (*(++ptr))

2783

{

2784

case '#': /* Comment; skip to ket */

2785

ptr++;

2786

while (*ptr != ')') ptr++;

2787

continue;

2788

2789

case ':': /* Non-extracting bracket */

2790

bravalue = OP_BRA;

2791

ptr++;

2792

break;

2793

2794

case '(':

2795

bravalue = OP_COND; /* Conditional group */

2796

2797

/* Condition to test for recursion */

2798

2799

if (ptr[1] == 'R')

2800

{

2801

code[1+LINK_SIZE] = OP_CREF;

2802

PUT2(code, 2+LINK_SIZE, CREF_RECURSE);

2803

skipbytes = 3;

2804

ptr += 3;

2805

}

2806

2807

/* Condition to test for a numbered subpattern match. We know that

2808

if a digit follows ( then there will just be digits until ) because

2809

the syntax was checked in the first pass. */

2810

2811

else if ((digitab[ptr[1]] && ctype_digit) != 0)

2812

{

2813

int condref; /* Don't amalgamate; some compilers */

2814

condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */

2815

while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';

2816

if (condref == 0)

2817

{

2818

*errorcodeptr = ERR35;

2819

goto FAILED;

2820

}

2821

ptr++;

2822

code[1+LINK_SIZE] = OP_CREF;

2823

PUT2(code, 2+LINK_SIZE, condref);

2824

skipbytes = 3;

2825

}

2826

/* For conditions that are assertions, we just fall through, having

2827

set bravalue above. */

2828

break;

2829

2830

case '=': /* Positive lookahead */

2831

bravalue = OP_ASSERT;

2832

ptr++;

2833

break;

2834

2835

case '!': /* Negative lookahead */

2836

bravalue = OP_ASSERT_NOT;

2837

ptr++;

2838

break;

2839

2840

case '<': /* Lookbehinds */

2841

switch (*(++ptr))

2842

{

2843

case '=': /* Positive lookbehind */

2844

bravalue = OP_ASSERTBACK;

2845

ptr++;

2846

break;

2847

2848

case '!': /* Negative lookbehind */

2849

bravalue = OP_ASSERTBACK_NOT;

2850

ptr++;

2851

break;

2852

}

2853

break;

2854

2855

case '>': /* One-time brackets */

2856

bravalue = OP_ONCE;

2857

ptr++;

2858

break;

2859

2860

case 'C': /* Callout - may be followed by digits; */

2861

previous_callout = code; /* Save for later completion */

2862

after_manual_callout = 1; /* Skip one item before completing */

2863

*code++ = OP_CALLOUT; /* Already checked that the terminating */

2864

{ /* closing parenthesis is present. */

2865

int n = 0;

2866

while ((digitab[*(++ptr)] & ctype_digit) != 0)

2867

n = n * 10 + *ptr - '0';

2868

if (n > 255)

2869

{

2870

*errorcodeptr = ERR38;

2871

goto FAILED;

2872

}

2873

*code++ = n;

2874

PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */

2875

PUT(code, LINK_SIZE, 0); /* Default length */

2876

code += 2 * LINK_SIZE;

2877

}

2878

previous = NULL;

2879

continue;

2880

2881

case 'P': /* Named subpattern handling */

2882

if (*(++ptr) == '<') /* Definition */

2883

{

2884

int i, namelen;

2885

uschar *slot = cd->name_table;

2886

const uschar *name; /* Don't amalgamate; some compilers */

2887

name = ++ptr; /* grumble at autoincrement in declaration */

2888

2889

while (*ptr++ != '>');

2890

namelen = ptr - name - 1;

2891

2892

for (i = 0; i < cd->names_found; i++)

2893

{

2894

int crc = memcmp(name, slot+2, namelen);

2895

if (crc == 0)

2896

{

2897

if (slot[2+namelen] == 0)

2898

{

2899

*errorcodeptr = ERR43;

2900

goto FAILED;

2901

}

2902

crc = -1; /* Current name is substring */

2903

}

2904

if (crc < 0)

2905

{

2906

memmove(slot + cd->name_entry_size, slot,

2907

(cd->names_found - i) * cd->name_entry_size);

2908

break;

2909

}

2910

slot += cd->name_entry_size;

2911

}

2912

2913

PUT2(slot, 0, *brackets + 1);

2914

memcpy(slot + 2, name, namelen);

2915

slot[2+namelen] = 0;

2916

cd->names_found++;

2917

goto NUMBERED_GROUP;

2918

}

2919

2920

if (*ptr == '=' || *ptr == '>') /* Reference or recursion */

2921

{

2922

int i, namelen;

2923

int type = *ptr++;

2924

const uschar *name = ptr;

2925

uschar *slot = cd->name_table;

2926

2927

while (*ptr != ')') ptr++;

2928

namelen = ptr - name;

2929

2930

for (i = 0; i < cd->names_found; i++)

2931

{

2932

if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;

2933

slot += cd->name_entry_size;

2934

}

2935

if (i >= cd->names_found)

2936

{

2937

*errorcodeptr = ERR15;

2938

goto FAILED;

2939

}

2940

2941

recno = GET2(slot, 0);

2942

2943

if (type == '>') goto HANDLE_RECURSION; /* A few lines below */

2944

2945

/* Back reference */

2946

2947

previous = code;

2948

*code++ = OP_REF;

2949

PUT2INC(code, 0, recno);

2950

cd->backref_map |= (recno < 32)? (1 << recno) : 1;

2951

if (recno > cd->top_backref) cd->top_backref = recno;

2952

continue;

2953

}

2954

2955

/* Should never happen */

2956

break;

2957

2958

case 'R': /* Pattern recursion */

2959

ptr++; /* Same as (?0) */

2960

/* Fall through */

2961

2962

/* Recursion or "subroutine" call */

2963

2964

case '0': case '1': case '2': case '3': case '4':

2965

case '5': case '6': case '7': case '8': case '9':

2966

{

2967

const uschar *called;

2968

recno = 0;

2969

while((digitab[*ptr] & ctype_digit) != 0)

2970

recno = recno * 10 + *ptr++ - '0';

2971

2972

/* Come here from code above that handles a named recursion */

2973

2974

HANDLE_RECURSION:

2975

2976

previous = code;

2977

2978

/* Find the bracket that is being referenced. Temporarily end the

2979

regex in case it doesn't exist. */

2980

2981

*code = OP_END;

2982

called = (recno == 0)?

2983

cd->start_code : find_bracket(cd->start_code, utf8, recno);

2984

2985

if (called == NULL)

2986

{

2987

*errorcodeptr = ERR15;

2988

goto FAILED;

2989

}

2990

2991

/* If the subpattern is still open, this is a recursive call. We

2992

check to see if this is a left recursion that could loop for ever,

2993

and diagnose that case. */

2994

2995

if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))

2996

{

2997

*errorcodeptr = ERR40;

2998

goto FAILED;

2999

}

3000

3001

/* Insert the recursion/subroutine item */

3002

3003

*code = OP_RECURSE;

3004

PUT(code, 1, called - cd->start_code);

3005

code += 1 + LINK_SIZE;

3006

}

3007

continue;

3008

3009

/* Character after (? not specially recognized */

3010

3011

default: /* Option setting */

3012

set = unset = 0;

3013

optset = &set;

3014

3015

while (*ptr != ')' && *ptr != ':')

3016

{

3017

switch (*ptr++)

3018

{

3019

case '-': optset = &unset; break;

3020

3021

case 'i': *optset |= PCRE_CASELESS; break;

3022

case 'm': *optset |= PCRE_MULTILINE; break;

3023

case 's': *optset |= PCRE_DOTALL; break;

3024

case 'x': *optset |= PCRE_EXTENDED; break;

3025

case 'U': *optset |= PCRE_UNGREEDY; break;

3026

case 'X': *optset |= PCRE_EXTRA; break;

3027

}

3028

}

3029

3030

/* Set up the changed option bits, but don't change anything yet. */

3031

3032

newoptions = (options | set) & (~unset);

3033

3034

/* If the options ended with ')' this is not the start of a nested

3035

group with option changes, so the options change at this level. Compile

3036

code to change the ims options if this setting actually changes any of

3037

them. We also pass the new setting back so that it can be put at the

3038

start of any following branches, and when this group ends (if we are in

3039

a group), a resetting item can be compiled.

3040

3041

Note that if this item is right at the start of the pattern, the

3042

options will have been abstracted and made global, so there will be no

3043

change to compile. */

3044

3045

if (*ptr == ')')

3046

{

3047

if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))

3048

{

3049

*code++ = OP_OPT;

3050

*code++ = newoptions & PCRE_IMS;

3051

}

3052

3053

/* Change options at this level, and pass them back for use

3054

in subsequent branches. Reset the greedy defaults and the case

3055

value for firstbyte and reqbyte. */

3056

3057

*optionsptr = options = newoptions;

3058

greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);

3059

greedy_non_default = greedy_default ^ 1;

3060

req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;

3061

3062

previous = NULL; /* This item can't be repeated */

3063

continue; /* It is complete */

3064

}

3065

3066

/* If the options ended with ':' we are heading into a nested group

3067

with possible change of options. Such groups are non-capturing and are

3068

not assertions of any kind. All we need to do is skip over the ':';

3069

the newoptions value is handled below. */

3070

3071

bravalue = OP_BRA;

3072

ptr++;

3073

}

3074

}

3075

3076

/* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become

3077

non-capturing and behave like (?:...) brackets */

3078

3079

else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)

3080

{

3081

bravalue = OP_BRA;

3082

}

3083

3084

/* Else we have a referencing group; adjust the opcode. If the bracket

3085

number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and

3086

arrange for the true number to follow later, in an OP_BRANUMBER item. */

3087

3088

else

3089

{

3090

NUMBERED_GROUP:

3091

if (++(*brackets) > EXTRACT_BASIC_MAX)

3092

{

3093

bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;

3094

code[1+LINK_SIZE] = OP_BRANUMBER;

3095

PUT2(code, 2+LINK_SIZE, *brackets);

3096

skipbytes = 3;

3097

}

3098

else bravalue = OP_BRA + *brackets;

3099

}

3100

3101

/* Process nested bracketed re. Assertions may not be repeated, but other

3102

kinds can be. We copy code into a non-register variable in order to be able

3103

to pass its address because some compilers complain otherwise. Pass in a

3104

new setting for the ims options if they have changed. */

3105

3106

previous = (bravalue >= OP_ONCE)? code : NULL;

3107

*code = bravalue;

3108

tempcode = code;

3109

tempreqvary = cd->req_varyopt; /* Save value before bracket */

3110

3111

if (!compile_regex(

3112

newoptions, /* The complete new option state */

3113

options & PCRE_IMS, /* The previous ims option state */

3114

brackets, /* Extracting bracket count */

3115

&tempcode, /* Where to put code (updated) */

3116

&ptr, /* Input pointer (updated) */

3117

errorcodeptr, /* Where to put an error message */

3118

(bravalue == OP_ASSERTBACK ||

3119

bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */

3120

skipbytes, /* Skip over OP_COND/OP_BRANUMBER */

3121

&subfirstbyte, /* For possible first char */

3122

&subreqbyte, /* For possible last char */

3123

bcptr, /* Current branch chain */

3124

cd)) /* Tables block */

3125

goto FAILED;

3126

3127

/* At the end of compiling, code is still pointing to the start of the

3128

group, while tempcode has been updated to point past the end of the group

3129

and any option resetting that may follow it. The pattern pointer (ptr)

3130

is on the bracket. */

3131

3132

/* If this is a conditional bracket, check that there are no more than

3133

two branches in the group. */

3134

3135

else if (bravalue == OP_COND)

3136

{

3137

uschar *tc = code;

3138

condcount = 0;

3139

3140

do {

3141

condcount++;

3142

tc += GET(tc,1);

3143

}

3144

while (*tc != OP_KET);

3145

3146

if (condcount > 2)

3147

{

3148

*errorcodeptr = ERR27;

3149

goto FAILED;

3150

}

3151

3152

/* If there is just one branch, we must not make use of its firstbyte or

3153

reqbyte, because this is equivalent to an empty second branch. */

3154

3155

if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;

3156

}

3157

3158

/* Handle updating of the required and first characters. Update for normal

3159

brackets of all kinds, and conditions with two branches (see code above).

3160

If the bracket is followed by a quantifier with zero repeat, we have to

3161

back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the

3162

main loop so that they can be accessed for the back off. */

3163

3164

zeroreqbyte = reqbyte;

3165

zerofirstbyte = firstbyte;

3166

groupsetfirstbyte = FALSE;

3167

3168

if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)

3169

{

3170

/* If we have not yet set a firstbyte in this branch, take it from the

3171

subpattern, remembering that it was set here so that a repeat of more

3172

than one can replicate it as reqbyte if necessary. If the subpattern has

3173

no firstbyte, set "none" for the whole branch. In both cases, a zero

3174

repeat forces firstbyte to "none". */

3175

3176

if (firstbyte == REQ_UNSET)

3177

{

3178

if (subfirstbyte >= 0)

3179

{

3180

firstbyte = subfirstbyte;

3181

groupsetfirstbyte = TRUE;

3182

}

3183

else firstbyte = REQ_NONE;

3184

zerofirstbyte = REQ_NONE;

3185

}

3186

3187

/* If firstbyte was previously set, convert the subpattern's firstbyte

3188

into reqbyte if there wasn't one, using the vary flag that was in

3189

existence beforehand. */

3190

3191

else if (subfirstbyte >= 0 && subreqbyte < 0)

3192

subreqbyte = subfirstbyte | tempreqvary;

3193

3194

/* If the subpattern set a required byte (or set a first byte that isn't

3195

really the first byte - see above), set it. */

3196

3197

if (subreqbyte >= 0) reqbyte = subreqbyte;

3198

}

3199

3200

/* For a forward assertion, we take the reqbyte, if set. This can be

3201

helpful if the pattern that follows the assertion doesn't set a different

3202

char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte

3203

for an assertion, however, because it leads to incorrect effects for patterns

3204

such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead

3205

of a firstbyte. This is overcome by a scan at the end if there's no

3206

firstbyte, looking for an asserted first char. */

3207

3208

else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;

3209

3210

/* Now update the main code pointer to the end of the group. */

3211

3212

code = tempcode;

3213

3214

/* Error if hit end of pattern */

3215

3216

if (*ptr != ')')

3217

{

3218

*errorcodeptr = ERR14;

3219

goto FAILED;

3220

}

3221

break;

3222

3223

/* Check \ for being a real metacharacter; if not, fall through and handle

3224

it as a data character at the start of a string. Escape items are checked

3225

for validity in the pre-compiling pass. */

3226

3227

case '\\':

3228

tempptr = ptr;

3229

c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);

3230

3231

/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values

3232

are arranged to be the negation of the corresponding OP_values. For the

3233

back references, the values are ESC_REF plus the reference number. Only

3234

back references and those types that consume a character may be repeated.

3235

We can test for values between ESC_b and ESC_Z for the latter; this may

3236

have to change if any new ones are ever created. */

3237

3238

if (c < 0)

3239

{

3240

if (-c == ESC_Q) /* Handle start of quoted string */

3241

{

3242

if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */

3243

else inescq = TRUE;

3244

continue;

3245

}

3246

3247

/* For metasequences that actually match a character, we disable the

3248

setting of a first character if it hasn't already been set. */

3249

3250

if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)

3251

firstbyte = REQ_NONE;

3252

3253

/* Set values to reset to if this is followed by a zero repeat. */

3254

3255

zerofirstbyte = firstbyte;

3256

zeroreqbyte = reqbyte;

3257

3258

/* Back references are handled specially */

3259

3260

if (-c >= ESC_REF)

3261

{

3262

int number = -c - ESC_REF;

3263

previous = code;

3264

*code++ = OP_REF;

3265

PUT2INC(code, 0, number);

3266

}

3267

3268

/* So are Unicode property matches, if supported. We know that get_ucp

3269

won't fail because it was tested in the pre-pass. */

3270

3271

#ifdef SUPPORT_UCP

3272

else if (-c == ESC_P || -c == ESC_p)

3273

{

3274

BOOL negated;

3275

int value = get_ucp(&ptr, &negated, errorcodeptr);

3276

previous = code;

3277

*code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;

3278

*code++ = value;

3279

}

3280

#endif

3281

3282

/* For the rest, we can obtain the OP value by negating the escape

3283

value */

3284

3285

else

3286

{

3287

previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;

3288

*code++ = -c;

3289

}

3290

continue;

3291

}

3292

3293

/* We have a data character whose value is in c. In UTF-8 mode it may have

3294

a value > 127. We set its representation in the length/buffer, and then

3295

handle it as a data character. */

3296

3297

#ifdef SUPPORT_UTF8

3298

if (utf8 && c > 127)

3299

mclength = _pcre_ord2utf8(c, mcbuffer);

3300

else

3301

#endif

3302

3303

{

3304

mcbuffer[0] = c;

3305

mclength = 1;

3306

}

3307

3308

goto ONE_CHAR;

3309

3310

/* Handle a literal character. It is guaranteed not to be whitespace or #

3311

when the extended flag is set. If we are in UTF-8 mode, it may be a

3312

multi-byte literal character. */

3313

3314

default:

3315

NORMAL_CHAR:

3316

mclength = 1;

3317

mcbuffer[0] = c;

3318

3319

#ifdef SUPPORT_UTF8

3320

if (utf8 && (c & 0xc0) == 0xc0)

3321

{

3322

while ((ptr[1] & 0xc0) == 0x80)

3323

mcbuffer[mclength++] = *(++ptr);

3324

}

3325

#endif

3326

3327

/* At this point we have the character's bytes in mcbuffer, and the length

3328

in mclength. When not in UTF-8 mode, the length is always 1. */

3329

3330

ONE_CHAR:

3331

previous = code;

3332

*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;

3333

for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];

3334

3335

/* Set the first and required bytes appropriately. If no previous first

3336

byte, set it from this character, but revert to none on a zero repeat.

3337

Otherwise, leave the firstbyte value alone, and don't change it on a zero

3338

repeat. */

3339

3340

if (firstbyte == REQ_UNSET)

3341

{

3342

zerofirstbyte = REQ_NONE;

3343

zeroreqbyte = reqbyte;

3344

3345

/* If the character is more than one byte long, we can set firstbyte

3346

only if it is not to be matched caselessly. */

3347

3348

if (mclength == 1 || req_caseopt == 0)

3349

{

3350

firstbyte = mcbuffer[0] | req_caseopt;

3351

if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;

3352

}

3353

else firstbyte = reqbyte = REQ_NONE;

3354

}

3355

3356

/* firstbyte was previously set; we can set reqbyte only if the length is

3357

1 or the matching is caseful. */

3358

3359

else

3360

{

3361

zerofirstbyte = firstbyte;

3362

zeroreqbyte = reqbyte;

3363

if (mclength == 1 || req_caseopt == 0)

3364

reqbyte = code[-1] | req_caseopt | cd->req_varyopt;

3365

}

3366

3367

break; /* End of literal character handling */

3368

}

3369

} /* end of big loop */

3370

3371

/* Control never reaches here by falling through, only by a goto for all the

3372

error states. Pass back the position in the pattern so that it can be displayed

3373

to the user for diagnosing the error. */

3374

3375

FAILED:

3376

*ptrptr = ptr;

3377

return FALSE;

3378

}

3379

3380

3381

3382

3383

/*************************************************

3384

* Compile sequence of alternatives *

3385

*************************************************/

3386

3387

/* On entry, ptr is pointing past the bracket character, but on return

3388

it points to the closing bracket, or vertical bar, or end of string.

3389

The code variable is pointing at the byte into which the BRA operator has been

3390

stored. If the ims options are changed at the start (for a (?ims: group) or

3391

during any branch, we need to insert an OP_OPT item at the start of every

3392

following branch to ensure they get set correctly at run time, and also pass

3393

the new options into every subsequent branch compile.

3394

3395

Argument:

3396

options option bits, including any changes for this subpattern

3397

oldims previous settings of ims option bits

3398

brackets -> int containing the number of extracting brackets used

3399

codeptr -> the address of the current code pointer

3400

ptrptr -> the address of the current pattern pointer

3401

errorcodeptr -> pointer to error code variable

3402

lookbehind TRUE if this is a lookbehind assertion

3403

skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)

3404

firstbyteptr place to put the first required character, or a negative number

3405

reqbyteptr place to put the last required character, or a negative number

3406

bcptr pointer to the chain of currently open branches

3407

cd points to the data block with tables pointers etc.

3408

3409

Returns: TRUE on success

3410

3411

3412

static BOOL

3413

compile_regex(int options, int oldims, int *brackets, uschar **codeptr,

3414

const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,

3415

int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)

3416

{

3417

const uschar *ptr = *ptrptr;

3418

uschar *code = *codeptr;

3419

uschar *last_branch = code;

3420

uschar *start_bracket = code;

3421

uschar *reverse_count = NULL;

3422

int firstbyte, reqbyte;

3423

int branchfirstbyte, branchreqbyte;

3424

branch_chain bc;

3425

3426

bc.outer = bcptr;

3427

bc.current = code;

3428

3429

firstbyte = reqbyte = REQ_UNSET;

3430

3431

/* Offset is set zero to mark that this bracket is still open */

3432

3433

PUT(code, 1, 0);

3434

code += 1 + LINK_SIZE + skipbytes;

3435

3436

/* Loop for each alternative branch */

3437

3438

for (;;)

3439

{

3440

/* Handle a change of ims options at the start of the branch */

3441

3442

if ((options & PCRE_IMS) != oldims)

3443

{

3444

*code++ = OP_OPT;

3445

*code++ = options & PCRE_IMS;

3446

}

3447

3448

/* Set up dummy OP_REVERSE if lookbehind assertion */

3449

3450

if (lookbehind)

3451

{

3452

*code++ = OP_REVERSE;

3453

reverse_count = code;

3454

PUTINC(code, 0, 0);

3455

}

3456

3457

/* Now compile the branch */

3458

3459

if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,

3460

&branchfirstbyte, &branchreqbyte, &bc, cd))

3461

{

3462

*ptrptr = ptr;

3463

return FALSE;

3464

}

3465

3466

/* If this is the first branch, the firstbyte and reqbyte values for the

3467

branch become the values for the regex. */

3468

3469

if (*last_branch != OP_ALT)

3470

{

3471

firstbyte = branchfirstbyte;

3472

reqbyte = branchreqbyte;

3473

}

3474

3475

/* If this is not the first branch, the first char and reqbyte have to

3476

match the values from all the previous branches, except that if the previous

3477

value for reqbyte didn't have REQ_VARY set, it can still match, and we set

3478

REQ_VARY for the regex. */

3479

3480

else

3481

{

3482

/* If we previously had a firstbyte, but it doesn't match the new branch,

3483

we have to abandon the firstbyte for the regex, but if there was previously

3484

no reqbyte, it takes on the value of the old firstbyte. */

3485

3486

if (firstbyte >= 0 && firstbyte != branchfirstbyte)

3487

{

3488

if (reqbyte < 0) reqbyte = firstbyte;

3489

firstbyte = REQ_NONE;

3490

}

3491

3492

/* If we (now or from before) have no firstbyte, a firstbyte from the

3493

branch becomes a reqbyte if there isn't a branch reqbyte. */

3494

3495

if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)

3496

branchreqbyte = branchfirstbyte;

3497

3498

/* Now ensure that the reqbytes match */

3499

3500

if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))

3501

reqbyte = REQ_NONE;

3502

else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */

3503

}

3504

3505

/* If lookbehind, check that this branch matches a fixed-length string,

3506

and put the length into the OP_REVERSE item. Temporarily mark the end of

3507

the branch with OP_END. */

3508

3509

if (lookbehind)

3510

{

3511

int length;

3512

*code = OP_END;

3513

length = find_fixedlength(last_branch, options);

3514

DPRINTF(("fixed length = %d\n", length));

3515

if (length < 0)

3516

{

3517

*errorcodeptr = (length == -2)? ERR36 : ERR25;

3518

*ptrptr = ptr;

3519

return FALSE;

3520

}

3521

PUT(reverse_count, 0, length);

3522

}

3523

3524

/* Reached end of expression, either ')' or end of pattern. Go back through

3525

the alternative branches and reverse the chain of offsets, with the field in

3526

the BRA item now becoming an offset to the first alternative. If there are

3527

no alternatives, it points to the end of the group. The length in the

3528

terminating ket is always the length of the whole bracketed item. If any of

3529

the ims options were changed inside the group, compile a resetting op-code

3530

following, except at the very end of the pattern. Return leaving the pointer

3531

at the terminating char. */

3532

3533

if (*ptr != '|')

3534

{

3535

int length = code - last_branch;

3536

3537

{

3538

int prev_length = GET(last_branch, 1);

3539

PUT(last_branch, 1, length);

3540

length = prev_length;

3541

last_branch -= length;

3542

}

3543

while (length > 0);

3544

3545

/* Fill in the ket */

3546

3547

*code = OP_KET;

3548

PUT(code, 1, code - start_bracket);

3549

code += 1 + LINK_SIZE;

3550

3551

/* Resetting option if needed */

3552

3553

if ((options & PCRE_IMS) != oldims && *ptr == ')')

3554

{

3555

*code++ = OP_OPT;

3556

*code++ = oldims;

3557

}

3558

3559

/* Set values to pass back */

3560

3561

*codeptr = code;

3562

*ptrptr = ptr;

3563

*firstbyteptr = firstbyte;

3564

*reqbyteptr = reqbyte;

3565

return TRUE;

3566

}

3567

3568

/* Another branch follows; insert an "or" node. Its length field points back

3569

to the previous branch while the bracket remains open. At the end the chain

3570

is reversed. It's done like this so that the start of the bracket has a

3571

zero offset until it is closed, making it possible to detect recursion. */

3572

3573

*code = OP_ALT;

3574

PUT(code, 1, code - last_branch);

3575

bc.current = last_branch = code;

3576

code += 1 + LINK_SIZE;

3577

ptr++;

3578

}

3579

/* Control never reaches here */

3580

}

3581

3582

3583

3584

3585

/*************************************************

3586

* Check for anchored expression *

3587

*************************************************/

3588

3589

/* Try to find out if this is an anchored regular expression. Consider each

3590

alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket

3591

all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then

3592

it's anchored. However, if this is a multiline pattern, then only OP_SOD

3593

counts, since OP_CIRC can match in the middle.

3594

3595

We can also consider a regex to be anchored if OP_SOM starts all its branches.

3596

This is the code for \G, which means "match at start of match position, taking

3597

into account the match offset".

3598

3599

A branch is also implicitly anchored if it starts with .* and DOTALL is set,

3600

because that will try the rest of the pattern at all possible matching points,

3601

so there is no point trying again.... er ....

3602

3603

.... except when the .* appears inside capturing parentheses, and there is a

3604

subsequent back reference to those parentheses. We haven't enough information

3605

to catch that case precisely.

3606

3607

At first, the best we could do was to detect when .* was in capturing brackets

3608

and the highest back reference was greater than or equal to that level.

3609

However, by keeping a bitmap of the first 31 back references, we can catch some

3610

of the more common cases more precisely.

3611

3612

Arguments:

3613

code points to start of expression (the bracket)

3614

options points to the options setting

3615

bracket_map a bitmap of which brackets we are inside while testing; this

3616

handles up to substring 31; after that we just have to take

3617

the less precise approach

3618

backref_map the back reference bitmap

3619

3620

Returns: TRUE or FALSE

3621

3622

3623

static BOOL

3624

is_anchored(register const uschar *code, int *options, unsigned int bracket_map,

3625

unsigned int backref_map)

3626

{

3627

do {

3628

const uschar *scode =

3629

first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);

3630

3631

3632

/* Capturing brackets */

3633

3634

if (op > OP_BRA)

3635

{

3636

int new_map;

3637

op -= OP_BRA;

3638

if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);

3639

new_map = bracket_map | ((op < 32)? (1 << op) : 1);

3640

if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;

3641

}

3642

3643

/* Other brackets */

3644

3645

else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)

3646

{

3647

if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;

3648

}

3649

3650

/* .* is not anchored unless DOTALL is set and it isn't in brackets that

3651

are or may be referenced. */

3652

3653

else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&

3654

(*options & PCRE_DOTALL) != 0)

3655

{

3656

if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;

3657

}

3658

3659

/* Check for explicit anchoring */

3660

3661

else if (op != OP_SOD && op != OP_SOM &&

3662

((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))

3663

return FALSE;

3664

code += GET(code, 1);

3665

}

3666

while (*code == OP_ALT); /* Loop for each alternative */

3667

return TRUE;

3668

}

3669

3670

3671

3672

/*************************************************

3673

* Check for starting with ^ or .* *

3674

*************************************************/

3675

3676

/* This is called to find out if every branch starts with ^ or .* so that

3677

"first char" processing can be done to speed things up in multiline

3678

matching and for non-DOTALL patterns that start with .* (which must start at

3679

the beginning or after \n). As in the case of is_anchored() (see above), we

3680

have to take account of back references to capturing brackets that contain .*

3681

because in that case we can't make the assumption.

3682

3683

Arguments:

3684

code points to start of expression (the bracket)

3685

bracket_map a bitmap of which brackets we are inside while testing; this

3686

handles up to substring 31; after that we just have to take

3687

the less precise approach

3688

backref_map the back reference bitmap

3689

3690

Returns: TRUE or FALSE

3691

3692

3693

static BOOL

3694

is_startline(const uschar *code, unsigned int bracket_map,

3695

unsigned int backref_map)

3696

{

3697

do {

3698

const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,

3699

FALSE);

3700

3701

3702

/* Capturing brackets */

3703

3704

if (op > OP_BRA)

3705

{

3706

int new_map;

3707

op -= OP_BRA;

3708

if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);

3709

new_map = bracket_map | ((op < 32)? (1 << op) : 1);

3710

if (!is_startline(scode, new_map, backref_map)) return FALSE;

3711

}

3712

3713

/* Other brackets */

3714

3715

else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)

3716

{ if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }

3717

3718

/* .* means "start at start or after \n" if it isn't in brackets that

3719

may be referenced. */

3720

3721

else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)

3722

{

3723

if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;

3724

}

3725

3726

/* Check for explicit circumflex */

3727

3728

else if (op != OP_CIRC) return FALSE;

3729

3730

/* Move on to the next alternative */

3731

3732

code += GET(code, 1);

3733

}

3734

while (*code == OP_ALT); /* Loop for each alternative */

3735

return TRUE;

3736

}

3737

3738

3739

3740

/*************************************************

3741

* Check for asserted fixed first char *

3742

*************************************************/

3743

3744

/* During compilation, the "first char" settings from forward assertions are

3745

discarded, because they can cause conflicts with actual literals that follow.

3746

However, if we end up without a first char setting for an unanchored pattern,

3747

it is worth scanning the regex to see if there is an initial asserted first

3748

char. If all branches start with the same asserted char, or with a bracket all

3749

of whose alternatives start with the same asserted char (recurse ad lib), then

3750

we return that char, otherwise -1.

3751

3752

Arguments:

3753

code points to start of expression (the bracket)

3754

options pointer to the options (used to check casing changes)

3755

inassert TRUE if in an assertion

3756

3757

Returns: -1 or the fixed first char

3758

3759

3760

static int

3761

find_firstassertedchar(const uschar *code, int *options, BOOL inassert)

3762

{

3763

3764

do {

3765

int d;

3766

const uschar *scode =

3767

first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);

3768

3769

3770

if (op >= OP_BRA) op = OP_BRA;

3771

3772

switch(op)

3773

{

3774

default:

3775

return -1;

3776

3777

case OP_BRA:

3778

case OP_ASSERT:

3779

case OP_ONCE:

3780

case OP_COND:

3781

if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)

3782

return -1;

3783

if (c < 0) c = d; else if (c != d) return -1;

3784

break;

3785

3786

case OP_EXACT: /* Fall through */

3787

scode += 2;

3788

3789

case OP_CHAR:

3790

case OP_CHARNC:

3791

case OP_PLUS:

3792

case OP_MINPLUS:

3793

if (!inassert) return -1;

3794

if (c < 0)

3795

{

3796

c = scode[1];

3797

if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;

3798

}

3799

else if (c != scode[1]) return -1;

3800

break;

3801

}

3802

3803

code += GET(code, 1);

3804

}

3805

while (*code == OP_ALT);

3806

return c;

3807

}

3808

3809

3810

3811

/*************************************************

3812

* Compile a Regular Expression *

3813

*************************************************/

3814

3815

/* This function takes a string and returns a pointer to a block of store

3816

holding a compiled version of the expression. The original API for this

3817

function had no error code return variable; it is retained for backwards

3818

compatibility. The new function is given a new name.

3819

3820

Arguments:

3821

pattern the regular expression

3822

options various option bits

3823

errorcodeptr pointer to error code variable (pcre_compile2() only)

3824

can be NULL if you don't want a code value

3825

errorptr pointer to pointer to error text

3826

erroroffset ptr offset in pattern where error was detected

3827

tables pointer to character tables or NULL

3828

3829

Returns: pointer to compiled data block, or NULL on error,

3830

with errorptr and erroroffset set

3831

3832

3833

EXPORT pcre *

3834

pcre_compile(const char *pattern, int options, const char **errorptr,

3835

int *erroroffset, const unsigned char *tables)

3836

{

3837

return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);

3838

}

3839

3840

3841

EXPORT pcre *

3842

pcre_compile2(const char *pattern, int options, int *errorcodeptr,

3843

const char **errorptr, int *erroroffset, const unsigned char *tables)

3844

{

3845

real_pcre *re;

3846

int length = 1 + LINK_SIZE; /* For initial BRA plus length */

3847

int c, firstbyte, reqbyte;

3848

int bracount = 0;

3849

int branch_extra = 0;

3850

int branch_newextra;

3851

int item_count = -1;

3852

int name_count = 0;

3853

int max_name_size = 0;

3854

int lastitemlength = 0;

3855

int errorcode = 0;

3856

#ifdef SUPPORT_UTF8

3857

BOOL utf8;

3858

BOOL class_utf8;

3859

#endif

3860

BOOL inescq = FALSE;

3861

unsigned int brastackptr = 0;

3862

size_t size;

3863

uschar *code;

3864

const uschar *codestart;

3865

const uschar *ptr;

3866

compile_data compile_block;

3867

int brastack[BRASTACK_SIZE];

3868

uschar bralenstack[BRASTACK_SIZE];

3869

3870

/* We can't pass back an error message if errorptr is NULL; I guess the best we

3871

can do is just return NULL, but we can set a code value if there is a code

3872

pointer. */

3873

3874

if (errorptr == NULL)

3875

{

3876

if (errorcodeptr != NULL) *errorcodeptr = 99;

3877

return NULL;

3878

}

3879

3880

*errorptr = NULL;

3881

if (errorcodeptr != NULL) *errorcodeptr = ERR0;

3882

3883

/* However, we can give a message for this error */

3884

3885

if (erroroffset == NULL)

3886

{

3887

errorcode = ERR16;

3888

goto PCRE_EARLY_ERROR_RETURN;

3889

}

3890

3891

*erroroffset = 0;

3892

3893

/* Can't support UTF8 unless PCRE has been compiled to include the code. */

3894

3895

#ifdef SUPPORT_UTF8

3896

utf8 = (options & PCRE_UTF8) != 0;

3897

if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&

3898

(*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)

3899

{

3900

errorcode = ERR44;

3901

goto PCRE_EARLY_ERROR_RETURN;

3902

}

3903

#else

3904

if ((options & PCRE_UTF8) != 0)

3905

{

3906

errorcode = ERR32;

3907

goto PCRE_EARLY_ERROR_RETURN;

3908

}

3909

#endif

3910

3911

if ((options & ~PUBLIC_OPTIONS) != 0)

3912

{

3913

errorcode = ERR17;

3914

goto PCRE_EARLY_ERROR_RETURN;

3915

}

3916

3917

/* Set up pointers to the individual character tables */

3918

3919

if (tables == NULL) tables = _pcre_default_tables;

3920

compile_block.lcc = tables + lcc_offset;

3921

compile_block.fcc = tables + fcc_offset;

3922

compile_block.cbits = tables + cbits_offset;

3923

compile_block.ctypes = tables + ctypes_offset;

3924

3925

/* Maximum back reference and backref bitmap. This is updated for numeric

3926

references during the first pass, but for named references during the actual

3927

compile pass. The bitmap records up to 31 back references to help in deciding

3928

whether (.*) can be treated as anchored or not. */

3929

3930

compile_block.top_backref = 0;

3931

compile_block.backref_map = 0;

3932

3933

/* Reflect pattern for debugging output */

3934

3935

DPRINTF(("------------------------------------------------------------------\n"));

3936

DPRINTF(("%s\n", pattern));

3937

3938

/* The first thing to do is to make a pass over the pattern to compute the

3939

amount of store required to hold the compiled code. This does not have to be

3940

perfect as long as errors are overestimates. At the same time we can detect any

3941

flag settings right at the start, and extract them. Make an attempt to correct

3942

for any counted white space if an "extended" flag setting appears late in the

3943

pattern. We can't be so clever for #-comments. */

3944

3945

ptr = (const uschar *)(pattern - 1);

3946

while ((c = *(++ptr)) != 0)

3947

{

3948

int min, max;

3949

int class_optcount;

3950

int bracket_length;

3951

int duplength;

3952

3953

/* If we are inside a \Q...\E sequence, all chars are literal */

3954

3955

if (inescq)

3956

{

3957

if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;

3958

goto NORMAL_CHAR;

3959

}

3960

3961

/* Otherwise, first check for ignored whitespace and comments */

3962

3963

if ((options & PCRE_EXTENDED) != 0)

3964

{

3965

if ((compile_block.ctypes[c] & ctype_space) != 0) continue;

3966

if (c == '#')

3967

{

3968

/* The space before the ; is to avoid a warning on a silly compiler

3969

on the Macintosh. */

3970

while ((c = *(++ptr)) != 0 && c != NEWLINE) ;

3971

if (c == 0) break;

3972

continue;

3973

}

3974

}

3975

3976

item_count++; /* Is zero for the first non-comment item */

3977

3978

/* Allow space for auto callout before every item except quantifiers. */

3979

3980

if ((options & PCRE_AUTO_CALLOUT) != 0 &&

3981

c != '*' && c != '+' && c != '?' &&

3982

(c != '{' || !is_counted_repeat(ptr + 1)))

3983

length += 2 + 2*LINK_SIZE;

3984

3985

switch(c)

3986

{

3987

/* A backslashed item may be an escaped data character or it may be a

3988

character type. */

3989

3990

case '\\':

3991

c = check_escape(&ptr, &errorcode, bracount, options, FALSE);

3992

if (errorcode != 0) goto PCRE_ERROR_RETURN;

3993

3994

lastitemlength = 1; /* Default length of last item for repeats */

3995

3996

if (c >= 0) /* Data character */

3997

{

3998

length += 2; /* For a one-byte character */

3999

4000

#ifdef SUPPORT_UTF8

4001

if (utf8 && c > 127)

4002

{

4003

int i;

4004

for (i = 0; i < _pcre_utf8_table1_size; i++)

4005

if (c <= _pcre_utf8_table1[i]) break;

4006

length += i;

4007

lastitemlength += i;

4008

}

4009

#endif

4010

4011

continue;

4012

}

4013

4014

/* If \Q, enter "literal" mode */

4015

4016

if (-c == ESC_Q)

4017

{

4018

inescq = TRUE;

4019

continue;

4020

}

4021

4022

/* \X is supported only if Unicode property support is compiled */

4023

4024

#ifndef SUPPORT_UCP

4025

if (-c == ESC_X)

4026

{

4027

errorcode = ERR45;

4028

goto PCRE_ERROR_RETURN;

4029

}

4030

#endif

4031

4032

/* \P and \p are for Unicode properties, but only when the support has

4033

been compiled. Each item needs 2 bytes. */

4034

4035

else if (-c == ESC_P || -c == ESC_p)

4036

{

4037

#ifdef SUPPORT_UCP

4038

BOOL negated;

4039

length += 2;

4040

lastitemlength = 2;

4041

if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;

4042

continue;

4043

#else

4044

errorcode = ERR45;

4045

goto PCRE_ERROR_RETURN;

4046

#endif

4047

}

4048

4049

/* Other escapes need one byte */

4050

4051

length++;

4052

4053

/* A back reference needs an additional 2 bytes, plus either one or 5

4054

bytes for a repeat. We also need to keep the value of the highest

4055

back reference. */

4056

4057

if (c <= -ESC_REF)

4058

{

4059

int refnum = -c - ESC_REF;

4060

compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;

4061

if (refnum > compile_block.top_backref)

4062

compile_block.top_backref = refnum;

4063

length += 2; /* For single back reference */

4064

if (ptr[1] == '{' && is_counted_repeat(ptr+2))

4065

{

4066

ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);

4067

if (errorcode != 0) goto PCRE_ERROR_RETURN;

4068

if ((min == 0 && (max == 1 || max == -1)) ||

4069

(min == 1 && max == -1))

4070

length++;

4071

else length += 5;

4072

if (ptr[1] == '?') ptr++;

4073

}

4074

}

4075

continue;

4076

4077

case '^': /* Single-byte metacharacters */

4078

case '.':

4079

case '$':

4080

length++;

4081

lastitemlength = 1;

4082

continue;

4083

4084

case '*': /* These repeats won't be after brackets; */

4085

case '+': /* those are handled separately */

4086

case '?':

4087

length++;

4088

goto POSESSIVE; /* A few lines below */

4089

4090

/* This covers the cases of braced repeats after a single char, metachar,

4091

class, or back reference. */

4092

4093

case '{':

4094

if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;

4095

ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);

4096

if (errorcode != 0) goto PCRE_ERROR_RETURN;

4097

4098

/* These special cases just insert one extra opcode */

4099

4100

if ((min == 0 && (max == 1 || max == -1)) ||

4101

(min == 1 && max == -1))

4102

length++;

4103

4104

/* These cases might insert additional copies of a preceding character. */

4105

4106

else

4107

{

4108

if (min != 1)

4109

{

4110

length -= lastitemlength; /* Uncount the original char or metachar */

4111

if (min > 0) length += 3 + lastitemlength;

4112

}

4113

length += lastitemlength + ((max > 0)? 3 : 1);

4114

}

4115

4116

if (ptr[1] == '?') ptr++; /* Needs no extra length */

4117

4118

POSESSIVE: /* Test for possessive quantifier */

4119

if (ptr[1] == '+')

4120

{

4121

ptr++;

4122

length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */

4123

}

4124

continue;

4125

4126

/* An alternation contains an offset to the next branch or ket. If any ims

4127

options changed in the previous branch(es), and/or if we are in a

4128

lookbehind assertion, extra space will be needed at the start of the

4129

branch. This is handled by branch_extra. */

4130

4131

case '|':

4132

length += 1 + LINK_SIZE + branch_extra;

4133

continue;

4134

4135

/* A character class uses 33 characters provided that all the character

4136

values are less than 256. Otherwise, it uses a bit map for low valued

4137

characters, and individual items for others. Don't worry about character

4138

types that aren't allowed in classes - they'll get picked up during the

4139

compile. A character class that contains only one single-byte character

4140

uses 2 or 3 bytes, depending on whether it is negated or not. Notice this

4141

where we can. (In UTF-8 mode we can do this only for chars < 128.) */

4142

4143

case '[':

4144

if (*(++ptr) == '^')

4145

{

4146

class_optcount = 10; /* Greater than one */

4147

ptr++;

4148

}

4149

else class_optcount = 0;

4150

4151

#ifdef SUPPORT_UTF8

4152

class_utf8 = FALSE;

4153

#endif

4154

4155

/* Written as a "do" so that an initial ']' is taken as data */

4156

4157

if (*ptr != 0) do

4158

{

4159

/* Inside \Q...\E everything is literal except \E */

4160

4161

if (inescq)

4162

{

4163

if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;

4164

inescq = FALSE;

4165

ptr += 1;

4166

continue;

4167

}

4168

4169

/* Outside \Q...\E, check for escapes */

4170

4171

if (*ptr == '\\')

4172

{

4173

c = check_escape(&ptr, &errorcode, bracount, options, TRUE);

4174

if (errorcode != 0) goto PCRE_ERROR_RETURN;

4175

4176

/* \b is backspace inside a class; \X is literal */

4177

4178

if (-c == ESC_b) c = '\b';

4179

else if (-c == ESC_X) c = 'X';

4180

4181

/* \Q enters quoting mode */

4182

4183

else if (-c == ESC_Q)

4184

{

4185

inescq = TRUE;

4186

continue;

4187

}

4188

4189

/* Handle escapes that turn into characters */

4190

4191

if (c >= 0) goto NON_SPECIAL_CHARACTER;

4192

4193

/* Escapes that are meta-things. The normal ones just affect the

4194

bit map, but Unicode properties require an XCLASS extended item. */

4195

4196

else

4197

{

4198

class_optcount = 10; /* \d, \s etc; make sure > 1 */

4199

#ifdef SUPPORT_UTF8

4200

if (-c == ESC_p || -c == ESC_P)

4201

{

4202

if (!class_utf8)

4203

{

4204

class_utf8 = TRUE;

4205

length += LINK_SIZE + 2;

4206

}

4207

length += 2;

4208

}

4209

#endif

4210

}

4211

}

4212

4213

/* Check the syntax for POSIX stuff. The bits we actually handle are

4214

checked during the real compile phase. */

4215

4216

else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))

4217

{

4218

ptr++;

4219

class_optcount = 10; /* Make sure > 1 */

4220

}

4221

4222

/* Anything else increments the possible optimization count. We have to

4223

detect ranges here so that we can compute the number of extra ranges for

4224

caseless wide characters when UCP support is available. If there are wide

4225

characters, we are going to have to use an XCLASS, even for single

4226

characters. */

4227

4228

else

4229

{

4230

int d;

4231

4232

GET_ONE_CHARACTER:

4233

4234

#ifdef SUPPORT_UTF8

4235

if (utf8)

4236

{

4237

int extra = 0;

4238

GETCHARLEN(c, ptr, extra);

4239

ptr += extra;

4240

}

4241

else c = *ptr;

4242

#else

4243

c = *ptr;

4244

#endif

4245

4246

/* Come here from handling \ above when it escapes to a char value */

4247

4248

NON_SPECIAL_CHARACTER:

4249

class_optcount++;

4250

4251

d = -1;

4252

if (ptr[1] == '-')

4253

{

4254

uschar const *hyptr = ptr++;

4255

if (ptr[1] == '\\')

4256

{

4257

ptr++;

4258

d = check_escape(&ptr, &errorcode, bracount, options, TRUE);

4259

if (errorcode != 0) goto PCRE_ERROR_RETURN;

4260

if (-d == ESC_b) d = '\b'; /* backspace */

4261

else if (-d == ESC_X) d = 'X'; /* literal X in a class */

4262

}

4263

else if (ptr[1] != 0 && ptr[1] != ']')

4264

{

4265

ptr++;

4266

#ifdef SUPPORT_UTF8

4267

if (utf8)

4268

{

4269

int extra = 0;

4270

GETCHARLEN(d, ptr, extra);

4271

ptr += extra;

4272

}

4273

else

4274

#endif

4275

d = *ptr;

4276

}

4277

if (d < 0) ptr = hyptr; /* go back to hyphen as data */

4278

}

4279

4280

/* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >

4281

127 for caseless matching, we will need to use an XCLASS. */

4282

4283

if (d >= 0)

4284

{

4285

class_optcount = 10; /* Ensure > 1 */

4286

if (d < c)

4287

{

4288

errorcode = ERR8;

4289

goto PCRE_ERROR_RETURN;

4290

}

4291

4292

#ifdef SUPPORT_UTF8

4293

if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))

4294

{

4295

uschar buffer[6];

4296

if (!class_utf8) /* Allow for XCLASS overhead */

4297

{

4298

class_utf8 = TRUE;

4299

length += LINK_SIZE + 2;

4300

}

4301

4302

#ifdef SUPPORT_UCP

4303

/* If we have UCP support, find out how many extra ranges are

4304

needed to map the other case of characters within this range. We

4305

have to mimic the range optimization here, because extending the

4306

range upwards might push d over a boundary that makes it use

4307

another byte in the UTF-8 representation. */

4308

4309

if ((options & PCRE_CASELESS) != 0)

4310

{

4311

int occ, ocd;

4312

int cc = c;

4313

int origd = d;

4314

while (get_othercase_range(&cc, origd, &occ, &ocd))

4315

{

4316

if (occ >= c && ocd <= d) continue; /* Skip embedded */

4317

4318

if (occ < c && ocd >= c - 1) /* Extend the basic range */

4319

{ /* if there is overlap, */

4320

c = occ; /* noting that if occ < c */

4321

continue; /* we can't have ocd > d */

4322

} /* because a subrange is */

4323

if (ocd > d && occ <= d + 1) /* always shorter than */

4324

{ /* the basic range. */

4325

d = ocd;

4326

continue;

4327

}

4328

4329

/* An extra item is needed */

4330

4331

length += 1 + _pcre_ord2utf8(occ, buffer) +

4332

((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));

4333

}

4334

}

4335

#endif /* SUPPORT_UCP */

4336

4337

/* The length of the (possibly extended) range */

4338

4339

length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);

4340

}

4341

#endif /* SUPPORT_UTF8 */

4342

4343

}

4344

4345

/* We have a single character. There is nothing to be done unless we

4346

are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must

4347

allow for an XCL_SINGLE item, doubled for caselessness if there is UCP

4348

support. */

4349

4350

else

4351

{

4352

#ifdef SUPPORT_UTF8

4353

if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))

4354

{

4355

uschar buffer[6];

4356

class_optcount = 10; /* Ensure > 1 */

4357

if (!class_utf8) /* Allow for XCLASS overhead */

4358

{

4359

class_utf8 = TRUE;

4360

length += LINK_SIZE + 2;

4361

}

4362

#ifdef SUPPORT_UCP

4363

length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *

4364

(1 + _pcre_ord2utf8(c, buffer));

4365

#else /* SUPPORT_UCP */

4366

length += 1 + _pcre_ord2utf8(c, buffer);

4367

#endif /* SUPPORT_UCP */

4368

}

4369

#endif /* SUPPORT_UTF8 */

4370

}

4371

}

4372

}

4373

while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */

4374

4375

if (*ptr == 0) /* Missing terminating ']' */

4376

{

4377

errorcode = ERR6;

4378

goto PCRE_ERROR_RETURN;

4379

}

4380

4381

/* We can optimize when there was only one optimizable character. Repeats

4382

for positive and negated single one-byte chars are handled by the general

4383

code. Here, we handle repeats for the class opcodes. */

4384

4385

if (class_optcount == 1) length += 3; else

4386

{

4387

length += 33;

4388

4389

/* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,

4390

we also need extra for wrapping the whole thing in a sub-pattern. */

4391

4392

if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))

4393

{

4394

ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);

4395

if (errorcode != 0) goto PCRE_ERROR_RETURN;

4396

if ((min == 0 && (max == 1 || max == -1)) ||

4397

(min == 1 && max == -1))

4398

length++;

4399

else length += 5;

4400

if (ptr[1] == '+')

4401

{

4402

ptr++;

4403

length += 2 + 2*LINK_SIZE;

4404

}

4405

else if (ptr[1] == '?') ptr++;

4406

}

4407

}

4408

continue;

4409

4410

/* Brackets may be genuine groups or special things */

4411

4412

case '(':

4413

branch_newextra = 0;

4414

bracket_length = 1 + LINK_SIZE;

4415

4416

/* Handle special forms of bracket, which all start (? */

4417

4418

if (ptr[1] == '?')

4419

{

4420

int set, unset;

4421

int *optset;

4422

4423

switch (c = ptr[2])

4424

{

4425

/* Skip over comments entirely */

4426

case '#':

4427

ptr += 3;

4428

while (*ptr != 0 && *ptr != ')') ptr++;

4429

if (*ptr == 0)

4430

{

4431

errorcode = ERR18;

4432

goto PCRE_ERROR_RETURN;

4433

}

4434

continue;

4435

4436

/* Non-referencing groups and lookaheads just move the pointer on, and

4437

then behave like a non-special bracket, except that they don't increment

4438

the count of extracting brackets. Ditto for the "once only" bracket,

4439

which is in Perl from version 5.005. */

4440

4441

case ':':

4442

case '=':

4443

case '!':

4444

case '>':

4445

ptr += 2;

4446

break;

4447

4448

/* (?R) specifies a recursive call to the regex, which is an extension

4449

to provide the facility which can be obtained by (?p{perl-code}) in

4450

Perl 5.6. In Perl 5.8 this has become (??{perl-code}).

4451

4452

From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to

4453

the appropriate numbered brackets. This includes both recursive and

4454

non-recursive calls. (?R) is now synonymous with (?0). */

4455

4456

case 'R':

4457

ptr++;

4458

4459

case '0': case '1': case '2': case '3': case '4':

4460

case '5': case '6': case '7': case '8': case '9':

4461

ptr += 2;

4462

if (c != 'R')

4463

while ((digitab[*(++ptr)] & ctype_digit) != 0);

4464

if (*ptr != ')')

4465

{

4466

errorcode = ERR29;

4467

goto PCRE_ERROR_RETURN;

4468

}

4469

length += 1 + LINK_SIZE;

4470

4471

/* If this item is quantified, it will get wrapped inside brackets so

4472

as to use the code for quantified brackets. We jump down and use the

4473

code that handles this for real brackets. */

4474

4475

if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')

4476

{

4477

length += 2 + 2 * LINK_SIZE; /* to make bracketed */

4478

duplength = 5 + 3 * LINK_SIZE;

4479

goto HANDLE_QUANTIFIED_BRACKETS;

4480

}

4481

continue;

4482

4483

/* (?C) is an extension which provides "callout" - to provide a bit of

4484

the functionality of the Perl (?{...}) feature. An optional number may

4485

follow (default is zero). */

4486

4487

case 'C':

4488

ptr += 2;

4489

while ((digitab[*(++ptr)] & ctype_digit) != 0);

4490

if (*ptr != ')')

4491

{

4492

errorcode = ERR39;

4493

goto PCRE_ERROR_RETURN;

4494

}

4495

length += 2 + 2*LINK_SIZE;

4496

continue;

4497

4498

/* Named subpatterns are an extension copied from Python */

4499

4500

case 'P':

4501

ptr += 3;

4502

if (*ptr == '<')

4503

{

4504

const uschar *p; /* Don't amalgamate; some compilers */

4505

p = ++ptr; /* grumble at autoincrement in declaration */

4506

while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;

4507

if (*ptr != '>')

4508

{

4509

errorcode = ERR42;

4510

goto PCRE_ERROR_RETURN;

4511

}

4512

name_count++;

4513

if (ptr - p > max_name_size) max_name_size = (ptr - p);

4514

break;

4515

}

4516

4517

if (*ptr == '=' || *ptr == '>')

4518

{

4519

while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);

4520

if (*ptr != ')')

4521

{

4522

errorcode = ERR42;

4523

goto PCRE_ERROR_RETURN;

4524

}

4525

break;

4526

}

4527

4528

/* Unknown character after (?P */

4529

4530

errorcode = ERR41;

4531

goto PCRE_ERROR_RETURN;

4532

4533

/* Lookbehinds are in Perl from version 5.005 */

4534

4535

case '<':

4536

ptr += 3;

4537

if (*ptr == '=' || *ptr == '!')

4538

{

4539

branch_newextra = 1 + LINK_SIZE;

4540

length += 1 + LINK_SIZE; /* For the first branch */

4541

break;

4542

}

4543

errorcode = ERR24;

4544

goto PCRE_ERROR_RETURN;

4545

4546

/* Conditionals are in Perl from version 5.005. The bracket must either

4547

be followed by a number (for bracket reference) or by an assertion

4548

group, or (a PCRE extension) by 'R' for a recursion test. */

4549

4550

case '(':

4551

if (ptr[3] == 'R' && ptr[4] == ')')

4552

{

4553

ptr += 4;

4554

length += 3;

4555

}

4556

else if ((digitab[ptr[3]] & ctype_digit) != 0)

4557

{

4558

ptr += 4;

4559

length += 3;

4560

while ((digitab[*ptr] & ctype_digit) != 0) ptr++;

4561

if (*ptr != ')')

4562

{

4563

errorcode = ERR26;

4564

goto PCRE_ERROR_RETURN;

4565

}

4566

}

4567

else /* An assertion must follow */

4568

{

4569

ptr++; /* Can treat like ':' as far as spacing is concerned */

4570

if (ptr[2] != '?' ||

4571

(ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )

4572

{

4573

ptr += 2; /* To get right offset in message */

4574

errorcode = ERR28;

4575

goto PCRE_ERROR_RETURN;

4576

}

4577

}

4578

break;

4579

4580

/* Else loop checking valid options until ) is met. Anything else is an

4581

error. If we are without any brackets, i.e. at top level, the settings

4582

act as if specified in the options, so massage the options immediately.

4583

This is for backward compatibility with Perl 5.004. */

4584

4585

default:

4586

set = unset = 0;

4587

optset = &set;

4588

ptr += 2;

4589

4590

for (;; ptr++)

4591

{

4592

c = *ptr;

4593

switch (c)

4594

{

4595

case 'i':

4596

*optset |= PCRE_CASELESS;

4597

continue;

4598

4599

case 'm':

4600

*optset |= PCRE_MULTILINE;

4601

continue;

4602

4603

case 's':

4604

*optset |= PCRE_DOTALL;

4605

continue;

4606

4607

case 'x':

4608

*optset |= PCRE_EXTENDED;

4609

continue;

4610

4611

case 'X':

4612

*optset |= PCRE_EXTRA;

4613

continue;

4614

4615

case 'U':

4616

*optset |= PCRE_UNGREEDY;

4617

continue;

4618

4619

case '-':

4620

optset = &unset;

4621

continue;

4622

4623

/* A termination by ')' indicates an options-setting-only item; if

4624

this is at the very start of the pattern (indicated by item_count

4625

being zero), we use it to set the global options. This is helpful

4626

when analyzing the pattern for first characters, etc. Otherwise

4627

nothing is done here and it is handled during the compiling

4628

process.

4629

4630

[Historical note: Up to Perl 5.8, options settings at top level

4631

were always global settings, wherever they appeared in the pattern.

4632

That is, they were equivalent to an external setting. From 5.8

4633

onwards, they apply only to what follows (which is what you might

4634

expect).] */

4635

4636

case ')':

4637

if (item_count == 0)

4638

{

4639

options = (options | set) & (~unset);

4640

set = unset = 0; /* To save length */

4641

item_count--; /* To allow for several */

4642

}

4643

4644

/* Fall through */

4645

4646

/* A termination by ':' indicates the start of a nested group with

4647

the given options set. This is again handled at compile time, but

4648

we must allow for compiled space if any of the ims options are

4649

set. We also have to allow for resetting space at the end of

4650

the group, which is why 4 is added to the length and not just 2.

4651

If there are several changes of options within the same group, this

4652

will lead to an over-estimate on the length, but this shouldn't

4653

matter very much. We also have to allow for resetting options at

4654

the start of any alternations, which we do by setting

4655

branch_newextra to 2. Finally, we record whether the case-dependent

4656

flag ever changes within the regex. This is used by the "required

4657

character" code. */

4658

4659

case ':':

4660

if (((set|unset) & PCRE_IMS) != 0)

4661

{

4662

length += 4;

4663

branch_newextra = 2;

4664

if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;

4665

}

4666

goto END_OPTIONS;

4667

4668

/* Unrecognized option character */

4669

4670

default:

4671

errorcode = ERR12;

4672

goto PCRE_ERROR_RETURN;

4673

}

4674

}

4675

4676

/* If we hit a closing bracket, that's it - this is a freestanding

4677

option-setting. We need to ensure that branch_extra is updated if

4678

necessary. The only values branch_newextra can have here are 0 or 2.

4679

If the value is 2, then branch_extra must either be 2 or 5, depending

4680

on whether this is a lookbehind group or not. */

4681

4682

END_OPTIONS:

4683

if (c == ')')

4684

{

4685

if (branch_newextra == 2 &&

4686

(branch_extra == 0 || branch_extra == 1+LINK_SIZE))

4687

branch_extra += branch_newextra;

4688

continue;

4689

}

4690

4691

/* If options were terminated by ':' control comes here. Fall through

4692

to handle the group below. */

4693

}

4694

}

4695

4696

/* Extracting brackets must be counted so we can process escapes in a

4697

Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to

4698

need an additional 3 bytes of store per extracting bracket. However, if

4699

PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we

4700

must leave the count alone (it will always be zero). */

4701

4702

else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)

4703

{

4704

bracount++;

4705

if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;

4706

}

4707

4708

/* Save length for computing whole length at end if there's a repeat that

4709

requires duplication of the group. Also save the current value of

4710

branch_extra, and start the new group with the new value. If non-zero, this

4711

will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */

4712

4713

if (brastackptr >= sizeof(brastack)/sizeof(int))

4714

{

4715

errorcode = ERR19;

4716

goto PCRE_ERROR_RETURN;

4717

}

4718

4719

bralenstack[brastackptr] = branch_extra;

4720

branch_extra = branch_newextra;

4721

4722

brastack[brastackptr++] = length;

4723

length += bracket_length;

4724

continue;

4725

4726

/* Handle ket. Look for subsequent max/min; for certain sets of values we

4727

have to replicate this bracket up to that many times. If brastackptr is

4728

0 this is an unmatched bracket which will generate an error, but take care

4729

not to try to access brastack[-1] when computing the length and restoring

4730

the branch_extra value. */

4731

4732

case ')':

4733

length += 1 + LINK_SIZE;

4734

if (brastackptr > 0)

4735

{

4736

duplength = length - brastack[--brastackptr];

4737

branch_extra = bralenstack[brastackptr];

4738

}

4739

else duplength = 0;

4740

4741

/* The following code is also used when a recursion such as (?3) is

4742

followed by a quantifier, because in that case, it has to be wrapped inside

4743

brackets so that the quantifier works. The value of duplength must be

4744

set before arrival. */

4745

4746

HANDLE_QUANTIFIED_BRACKETS:

4747

4748

/* Leave ptr at the final char; for read_repeat_counts this happens

4749

automatically; for the others we need an increment. */

4750

4751

if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))

4752

{

4753

ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);

4754

if (errorcode != 0) goto PCRE_ERROR_RETURN;

4755

}

4756

else if (c == '*') { min = 0; max = -1; ptr++; }

4757

else if (c == '+') { min = 1; max = -1; ptr++; }

4758

else if (c == '?') { min = 0; max = 1; ptr++; }

4759

else { min = 1; max = 1; }

4760

4761

/* If the minimum is zero, we have to allow for an OP_BRAZERO before the

4762

group, and if the maximum is greater than zero, we have to replicate

4763

maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting

4764

bracket set. */

4765

4766

if (min == 0)

4767

{

4768

length++;

4769

if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);

4770

}

4771

4772

/* When the minimum is greater than zero, we have to replicate up to

4773

minval-1 times, with no additions required in the copies. Then, if there

4774

is a limited maximum we have to replicate up to maxval-1 times allowing

4775

for a BRAZERO item before each optional copy and nesting brackets for all

4776

but one of the optional copies. */

4777

4778

else

4779

{

4780

length += (min - 1) * duplength;

4781

if (max > min) /* Need this test as max=-1 means no limit */

4782

length += (max - min) * (duplength + 3 + 2*LINK_SIZE)

4783

- (2 + 2*LINK_SIZE);

4784

}

4785

4786

/* Allow space for once brackets for "possessive quantifier" */

4787

4788

if (ptr[1] == '+')

4789

{

4790

ptr++;

4791

length += 2 + 2*LINK_SIZE;

4792

}

4793

continue;

4794

4795

/* Non-special character. It won't be space or # in extended mode, so it is

4796

always a genuine character. If we are in a \Q...\E sequence, check for the

4797

end; if not, we have a literal. */

4798

4799

default:

4800

NORMAL_CHAR:

4801

4802

if (inescq && c == '\\' && ptr[1] == 'E')

4803

{

4804

inescq = FALSE;

4805

ptr++;

4806

continue;

4807

}

4808

4809

length += 2; /* For a one-byte character */

4810

lastitemlength = 1; /* Default length of last item for repeats */

4811

4812

/* In UTF-8 mode, check for additional bytes. */

4813

4814

#ifdef SUPPORT_UTF8

4815

if (utf8 && (c & 0xc0) == 0xc0)

4816

{

4817

while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */

4818

{ /* because the end is marked */

4819

lastitemlength++; /* by a zero byte. */

4820

length++;

4821

ptr++;

4822

}

4823

}

4824

#endif

4825

4826

continue;

4827

}

4828

}

4829

4830

length += 2 + LINK_SIZE; /* For final KET and END */

4831

4832

if ((options & PCRE_AUTO_CALLOUT) != 0)

4833

length += 2 + 2*LINK_SIZE; /* For final callout */

4834

4835

if (length > MAX_PATTERN_SIZE)

4836

{

4837

errorcode = ERR20;

4838

goto PCRE_EARLY_ERROR_RETURN;

4839

}

4840

4841

/* Compute the size of data block needed and get it, either from malloc or

4842

externally provided function. */

4843

4844

size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);

4845

re = (real_pcre *)(pcre_malloc)(size);

4846

4847

if (re == NULL)

4848

{

4849

errorcode = ERR21;

4850

goto PCRE_EARLY_ERROR_RETURN;

4851

}

4852

4853

/* Put in the magic number, and save the sizes, options, and character table

4854

pointer. NULL is used for the default character tables. The nullpad field is at

4855

the end; it's there to help in the case when a regex compiled on a system with

4856

4-byte pointers is run on another with 8-byte pointers. */

4857

4858

re->magic_number = MAGIC_NUMBER;

4859

re->size = size;

4860

re->options = options;

4861

re->dummy1 = 0;

4862

re->name_table_offset = sizeof(real_pcre);

4863

re->name_entry_size = max_name_size + 3;

4864

re->name_count = name_count;

4865

re->ref_count = 0;

4866

re->tables = (tables == _pcre_default_tables)? NULL : tables;

4867

re->nullpad = NULL;

4868

4869

/* The starting points of the name/number translation table and of the code are

4870

passed around in the compile data block. */

4871

4872

compile_block.names_found = 0;

4873

compile_block.name_entry_size = max_name_size + 3;

4874

compile_block.name_table = (uschar *)re + re->name_table_offset;

4875

codestart = compile_block.name_table + re->name_entry_size * re->name_count;

4876

compile_block.start_code = codestart;

4877

compile_block.start_pattern = (const uschar *)pattern;

4878

compile_block.req_varyopt = 0;

4879

compile_block.nopartial = FALSE;

4880

4881

/* Set up a starting, non-extracting bracket, then compile the expression. On

4882

error, errorcode will be set non-zero, so we don't need to look at the result

4883

of the function here. */

4884

4885

ptr = (const uschar *)pattern;

4886

code = (uschar *)codestart;

4887

*code = OP_BRA;

4888

bracount = 0;

4889

(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,

4890

&errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);

4891

re->top_bracket = bracount;

4892

re->top_backref = compile_block.top_backref;

4893

4894

if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;

4895

4896

/* If not reached end of pattern on success, there's an excess bracket. */

4897

4898

if (errorcode == 0 && *ptr != 0) errorcode = ERR22;

4899

4900

/* Fill in the terminating state and check for disastrous overflow, but

4901

if debugging, leave the test till after things are printed out. */

4902

4903

*code++ = OP_END;

4904

4905

#ifndef DEBUG

4906

if (code - codestart > length) errorcode = ERR23;

4907

#endif

4908

4909

/* Give an error if there's back reference to a non-existent capturing

4910

subpattern. */

4911

4912

if (re->top_backref > re->top_bracket) errorcode = ERR15;

4913

4914

/* Failed to compile, or error while post-processing */

4915

4916

if (errorcode != 0)

4917

{

4918

(pcre_free)(re);

4919

PCRE_ERROR_RETURN:

4920

*erroroffset = ptr - (const uschar *)pattern;

4921

PCRE_EARLY_ERROR_RETURN:

4922

*errorptr = error_texts[errorcode];

4923

if (errorcodeptr != NULL) *errorcodeptr = errorcode;

4924

return NULL;

4925

}

4926

4927

/* If the anchored option was not passed, set the flag if we can determine that

4928

the pattern is anchored by virtue of ^ characters or \A or anything else (such

4929

as starting with .* when DOTALL is set).

4930

4931

Otherwise, if we know what the first character has to be, save it, because that

4932

speeds up unanchored matches no end. If not, see if we can set the

4933

PCRE_STARTLINE flag. This is helpful for multiline matches when all branches

4934

start with ^. and also when all branches start with .* for non-DOTALL matches. */

4935

4936

4937

if ((options & PCRE_ANCHORED) == 0)

4938

{

4939

int temp_options = options;

4940

if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))

4941

re->options |= PCRE_ANCHORED;

4942

else

4943

{

4944

if (firstbyte < 0)

4945

firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);

4946

if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */

4947

{

4948

int ch = firstbyte & 255;

4949

re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&

4950

compile_block.fcc[ch] == ch)? ch : firstbyte;

4951

re->options |= PCRE_FIRSTSET;

4952

}

4953

else if (is_startline(codestart, 0, compile_block.backref_map))

4954

re->options |= PCRE_STARTLINE;

4955

}

4956

}

4957

4958

/* For an anchored pattern, we use the "required byte" only if it follows a

4959

variable length item in the regex. Remove the caseless flag for non-caseable

4960

bytes. */

4961

4962

if (reqbyte >= 0 &&

4963

((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))

4964

{

4965

int ch = reqbyte & 255;

4966

re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&

4967

compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;

4968

re->options |= PCRE_REQCHSET;

4969

}

4970

4971

/* Print out the compiled data for debugging */

4972

4973

#ifdef DEBUG

4974

4975

printf("Length = %d top_bracket = %d top_backref = %d\n",

4976

length, re->top_bracket, re->top_backref);

4977

4978

if (re->options != 0)

4979

{

4980

printf("%s%s%s%s%s%s%s%s%s%s\n",

4981

((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",

4982

((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",

4983

((re->options & PCRE_CASELESS) != 0)? "caseless " : "",

4984

((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",

4985

((re->options & PCRE_EXTENDED) != 0)? "extended " : "",

4986

((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",

4987

((re->options & PCRE_DOTALL) != 0)? "dotall " : "",

4988

((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",

4989

((re->options & PCRE_EXTRA) != 0)? "extra " : "",

4990

((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");

4991

}

4992

4993

if ((re->options & PCRE_FIRSTSET) != 0)

4994

{

4995

int ch = re->first_byte & 255;

4996

const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";

4997

if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);

4998

else printf("First char = \\x%02x%s\n", ch, caseless);

4999

}

5000

5001

if ((re->options & PCRE_REQCHSET) != 0)

5002

{

5003

int ch = re->req_byte & 255;

5004

const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";

5005

if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);

5006

else printf("Req char = \\x%02x%s\n", ch, caseless);

5007

}

5008

5009

_pcre_printint(re, stdout);

5010

5011

/* This check is done here in the debugging case so that the code that

5012

was compiled can be seen. */

5013

5014

if (code - codestart > length)

5015

{

5016

(pcre_free)(re);

5017

*errorptr = error_texts[ERR23];

5018

*erroroffset = ptr - (uschar *)pattern;

5019

if (errorcodeptr != NULL) *errorcodeptr = ERR23;

5020

return NULL;

5021

}

5022

#endif

5023

5024

return (pcre *)re;

5025

}

5026

5027

/* End of pcre_compile.c */

Older »