~ubuntu-branches/ubuntu/wily/tora/wily-proposed

« back to all changes in this revision

Viewing changes to src/qscintilla2/src/RESearch.cpp

Committer: Bazaar Package Importer
Author(s): Michael Meskes
Date: 2009-11-19 15:18:19 UTC
mfrom: (1.2.9 upstream) (3.3.3 squeeze)
Revision ID: james.westby@ubuntu.com-20091119151819-me89ezmxzkvl0lws

Tags: 2.1.1-1

New upstream version.

files added:
osx_tools

osx_tools/CompleteBundle.cmake.in

osx_tools/CreateBundle.sh.in

osx_tools/Info.plist.in

osx_tools/dmg

osx_tools/dmg/background-page001.png

osx_tools/dmg/background.sla

osx_tools/dmg/readme.html

osx_tools/dmg/tora.dmgCanvas

osx_tools/qt.conf.in

src/qscintilla2

src/qscintilla2/CMakeLists.txt

src/qscintilla2/ChangeLog

src/qscintilla2/Qt4

src/qscintilla2/Qt4/ListBoxQt.cpp

src/qscintilla2/Qt4/ListBoxQt.h

src/qscintilla2/Qt4/PlatQt.cpp

src/qscintilla2/Qt4/Qsci

src/qscintilla2/Qt4/Qsci/qsciabstractapis.h

src/qscintilla2/Qt4/Qsci/qsciapis.h

src/qscintilla2/Qt4/Qsci/qscicommand.h

src/qscintilla2/Qt4/Qsci/qscicommandset.h

src/qscintilla2/Qt4/Qsci/qscidocument.h

src/qscintilla2/Qt4/Qsci/qsciglobal.h

src/qscintilla2/Qt4/Qsci/qscilexer.h

src/qscintilla2/Qt4/Qsci/qscilexerbash.h

src/qscintilla2/Qt4/Qsci/qscilexerbatch.h

src/qscintilla2/Qt4/Qsci/qscilexercmake.h

src/qscintilla2/Qt4/Qsci/qscilexercpp.h

src/qscintilla2/Qt4/Qsci/qscilexercsharp.h

src/qscintilla2/Qt4/Qsci/qscilexercss.h

src/qscintilla2/Qt4/Qsci/qscilexerd.h

src/qscintilla2/Qt4/Qsci/qscilexerdiff.h

src/qscintilla2/Qt4/Qsci/qscilexerfortran.h

src/qscintilla2/Qt4/Qsci/qscilexerfortran77.h

src/qscintilla2/Qt4/Qsci/qscilexerhtml.h

src/qscintilla2/Qt4/Qsci/qscilexeridl.h

src/qscintilla2/Qt4/Qsci/qscilexerjava.h

src/qscintilla2/Qt4/Qsci/qscilexerjavascript.h

src/qscintilla2/Qt4/Qsci/qscilexerlua.h

src/qscintilla2/Qt4/Qsci/qscilexermakefile.h

src/qscintilla2/Qt4/Qsci/qscilexerpascal.h

src/qscintilla2/Qt4/Qsci/qscilexerperl.h

src/qscintilla2/Qt4/Qsci/qscilexerpostscript.h

src/qscintilla2/Qt4/Qsci/qscilexerpov.h

src/qscintilla2/Qt4/Qsci/qscilexerproperties.h

src/qscintilla2/Qt4/Qsci/qscilexerpython.h

src/qscintilla2/Qt4/Qsci/qscilexerruby.h

src/qscintilla2/Qt4/Qsci/qscilexersql.h

src/qscintilla2/Qt4/Qsci/qscilexertcl.h

src/qscintilla2/Qt4/Qsci/qscilexertex.h

src/qscintilla2/Qt4/Qsci/qscilexervhdl.h

src/qscintilla2/Qt4/Qsci/qscilexerxml.h

src/qscintilla2/Qt4/Qsci/qscilexeryaml.h

src/qscintilla2/Qt4/Qsci/qscimacro.h

src/qscintilla2/Qt4/Qsci/qsciprinter.h

src/qscintilla2/Qt4/Qsci/qsciscintilla.h

src/qscintilla2/Qt4/Qsci/qsciscintillabase.h

src/qscintilla2/Qt4/SciClasses.cpp

src/qscintilla2/Qt4/SciClasses.h

src/qscintilla2/Qt4/ScintillaQt.cpp

src/qscintilla2/Qt4/ScintillaQt.h

src/qscintilla2/Qt4/qsciabstractapis.cpp

src/qscintilla2/Qt4/qsciapis.cpp

src/qscintilla2/Qt4/qscicommand.cpp

src/qscintilla2/Qt4/qscicommandset.cpp

src/qscintilla2/Qt4/qscidocument.cpp

src/qscintilla2/Qt4/qscilexer.cpp

src/qscintilla2/Qt4/qscilexerbash.cpp

src/qscintilla2/Qt4/qscilexerbatch.cpp

src/qscintilla2/Qt4/qscilexercmake.cpp

src/qscintilla2/Qt4/qscilexercpp.cpp

src/qscintilla2/Qt4/qscilexercsharp.cpp

src/qscintilla2/Qt4/qscilexercss.cpp

src/qscintilla2/Qt4/qscilexerd.cpp

src/qscintilla2/Qt4/qscilexerdiff.cpp

src/qscintilla2/Qt4/qscilexerfortran.cpp

src/qscintilla2/Qt4/qscilexerfortran77.cpp

src/qscintilla2/Qt4/qscilexerhtml.cpp

src/qscintilla2/Qt4/qscilexeridl.cpp

src/qscintilla2/Qt4/qscilexerjava.cpp

src/qscintilla2/Qt4/qscilexerjavascript.cpp

src/qscintilla2/Qt4/qscilexerlua.cpp

src/qscintilla2/Qt4/qscilexermakefile.cpp

src/qscintilla2/Qt4/qscilexerpascal.cpp

src/qscintilla2/Qt4/qscilexerperl.cpp

src/qscintilla2/Qt4/qscilexerpostscript.cpp

src/qscintilla2/Qt4/qscilexerpov.cpp

src/qscintilla2/Qt4/qscilexerproperties.cpp

src/qscintilla2/Qt4/qscilexerpython.cpp

src/qscintilla2/Qt4/qscilexerruby.cpp

src/qscintilla2/Qt4/qscilexersql.cpp

src/qscintilla2/Qt4/qscilexertcl.cpp

src/qscintilla2/Qt4/qscilexertex.cpp

src/qscintilla2/Qt4/qscilexervhdl.cpp

src/qscintilla2/Qt4/qscilexerxml.cpp

src/qscintilla2/Qt4/qscilexeryaml.cpp

src/qscintilla2/Qt4/qscimacro.cpp

src/qscintilla2/Qt4/qsciprinter.cpp

src/qscintilla2/Qt4/qsciscintilla.cpp

src/qscintilla2/Qt4/qsciscintillabase.cpp

src/qscintilla2/include

src/qscintilla2/include/Accessor.h

src/qscintilla2/include/Face.py

src/qscintilla2/include/HFacer.py

src/qscintilla2/include/KeyWords.h

src/qscintilla2/include/License.txt

src/qscintilla2/include/Platform.h

src/qscintilla2/include/PropSet.h

src/qscintilla2/include/SString.h

src/qscintilla2/include/SciLexer.h

src/qscintilla2/include/Scintilla.h

src/qscintilla2/include/Scintilla.iface

src/qscintilla2/include/ScintillaWidget.h

src/qscintilla2/include/WindowAccessor.h

src/qscintilla2/src

src/qscintilla2/src/AutoComplete.cpp

src/qscintilla2/src/AutoComplete.h

src/qscintilla2/src/CallTip.cpp

src/qscintilla2/src/CallTip.h

src/qscintilla2/src/CellBuffer.cpp

src/qscintilla2/src/CellBuffer.h

src/qscintilla2/src/CharClassify.cpp

src/qscintilla2/src/CharClassify.h

src/qscintilla2/src/CharacterSet.h

src/qscintilla2/src/ContractionState.cpp

src/qscintilla2/src/ContractionState.h

src/qscintilla2/src/Decoration.cpp

src/qscintilla2/src/Decoration.h

src/qscintilla2/src/Document.cpp

src/qscintilla2/src/Document.h

src/qscintilla2/src/DocumentAccessor.cpp

src/qscintilla2/src/DocumentAccessor.h

src/qscintilla2/src/Editor.cpp

src/qscintilla2/src/Editor.h

src/qscintilla2/src/ExternalLexer.cpp

src/qscintilla2/src/ExternalLexer.h

src/qscintilla2/src/Indicator.cpp

src/qscintilla2/src/Indicator.h

src/qscintilla2/src/KeyMap.cpp

src/qscintilla2/src/KeyMap.h

src/qscintilla2/src/KeyWords.cpp

src/qscintilla2/src/LexAPDL.cpp

src/qscintilla2/src/LexASY.cpp

src/qscintilla2/src/LexAU3.cpp

src/qscintilla2/src/LexAVE.cpp

src/qscintilla2/src/LexAbaqus.cpp

src/qscintilla2/src/LexAda.cpp

src/qscintilla2/src/LexAsm.cpp

src/qscintilla2/src/LexAsn1.cpp

src/qscintilla2/src/LexBaan.cpp

src/qscintilla2/src/LexBash.cpp

src/qscintilla2/src/LexBasic.cpp

src/qscintilla2/src/LexBullant.cpp

src/qscintilla2/src/LexCLW.cpp

src/qscintilla2/src/LexCPP.cpp

src/qscintilla2/src/LexCSS.cpp

src/qscintilla2/src/LexCaml.cpp

src/qscintilla2/src/LexCmake.cpp

src/qscintilla2/src/LexConf.cpp

src/qscintilla2/src/LexCrontab.cpp

src/qscintilla2/src/LexCsound.cpp

src/qscintilla2/src/LexD.cpp

src/qscintilla2/src/LexEScript.cpp

src/qscintilla2/src/LexEiffel.cpp

src/qscintilla2/src/LexErlang.cpp

src/qscintilla2/src/LexFlagship.cpp

src/qscintilla2/src/LexForth.cpp

src/qscintilla2/src/LexFortran.cpp

src/qscintilla2/src/LexGAP.cpp

src/qscintilla2/src/LexGen.py

src/qscintilla2/src/LexGui4Cli.cpp

src/qscintilla2/src/LexHTML.cpp

src/qscintilla2/src/LexHaskell.cpp

src/qscintilla2/src/LexInno.cpp

src/qscintilla2/src/LexKix.cpp

src/qscintilla2/src/LexLisp.cpp

src/qscintilla2/src/LexLout.cpp

src/qscintilla2/src/LexLua.cpp

src/qscintilla2/src/LexMMIXAL.cpp

src/qscintilla2/src/LexMPT.cpp

src/qscintilla2/src/LexMSSQL.cpp

src/qscintilla2/src/LexMagik.cpp

src/qscintilla2/src/LexMatlab.cpp

src/qscintilla2/src/LexMetapost.cpp

src/qscintilla2/src/LexMySQL.cpp

src/qscintilla2/src/LexNsis.cpp

src/qscintilla2/src/LexOpal.cpp

src/qscintilla2/src/LexOthers.cpp

src/qscintilla2/src/LexPB.cpp

src/qscintilla2/src/LexPLM.cpp

src/qscintilla2/src/LexPOV.cpp

src/qscintilla2/src/LexPS.cpp

src/qscintilla2/src/LexPascal.cpp

src/qscintilla2/src/LexPerl.cpp

src/qscintilla2/src/LexPowerShell.cpp

src/qscintilla2/src/LexProgress.cpp

src/qscintilla2/src/LexPython.cpp

src/qscintilla2/src/LexR.cpp

src/qscintilla2/src/LexRebol.cpp

src/qscintilla2/src/LexRuby.cpp

src/qscintilla2/src/LexSQL.cpp

src/qscintilla2/src/LexScriptol.cpp

src/qscintilla2/src/LexSmalltalk.cpp

src/qscintilla2/src/LexSpecman.cpp

src/qscintilla2/src/LexSpice.cpp

src/qscintilla2/src/LexTADS3.cpp

src/qscintilla2/src/LexTCL.cpp

src/qscintilla2/src/LexTeX.cpp

src/qscintilla2/src/LexVB.cpp

src/qscintilla2/src/LexVHDL.cpp

src/qscintilla2/src/LexVerilog.cpp

src/qscintilla2/src/LexYAML.cpp

src/qscintilla2/src/License.txt

src/qscintilla2/src/LineMarker.cpp

src/qscintilla2/src/LineMarker.h

src/qscintilla2/src/Partitioning.h

src/qscintilla2/src/PositionCache.cpp

src/qscintilla2/src/PositionCache.h

src/qscintilla2/src/PropSet.cpp

src/qscintilla2/src/RESearch.cpp

src/qscintilla2/src/RESearch.h

src/qscintilla2/src/RunStyles.cpp

src/qscintilla2/src/RunStyles.h

src/qscintilla2/src/SVector.h

src/qscintilla2/src/SciTE.properties

src/qscintilla2/src/ScintillaBase.cpp

src/qscintilla2/src/ScintillaBase.h

src/qscintilla2/src/SplitVector.h

src/qscintilla2/src/Style.cpp

src/qscintilla2/src/Style.h

src/qscintilla2/src/StyleContext.cpp

src/qscintilla2/src/StyleContext.h

src/qscintilla2/src/UniConversion.cpp

src/qscintilla2/src/UniConversion.h

src/qscintilla2/src/ViewStyle.cpp

src/qscintilla2/src/ViewStyle.h

src/qscintilla2/src/WindowAccessor.cpp

src/qscintilla2/src/XPM.cpp

src/qscintilla2/src/XPM.h

src/tora.rc

src/windows/dummyoci

src/windows/dummyoci/Makefile

src/windows/dummyoci/oci_nonstub.c

src/windows/dummyoci/oci_stub.c

files modified:
CMakeLists.txt

Makefile.am

Makefile.in

README.BINARIES

README.CMAKE

README.OSX

README.RELEASE

README.WINDOWS

cmake/modules/FindOracle.cmake

configure

configure.ac

debian/changelog

debian/patches/03_README.LAYOUT.dpatch *

doc/help/Makefile

rpm/tora.spec

rpm/tora.spec.in

src/CMakeLists.txt

src/Makefile.am

src/Makefile.in

src/toalert.h

src/todebugtext.h

src/tomain.cpp

src/tomain.h

src/toqsqlconnection.cpp

src/toresultplan.cpp

src/toresultview.h

src/torollback.cpp

src/torollback.h

src/tosqledit.cpp

src/tosyntaxsetup.cpp

src/totemporary.h

src/towaitevents.cpp

src/toworksheet.cpp

src/utils.cpp

Show diffs side-by-side

added added

removed removed

src/qscintilla2/src/RESearch.cpp

// Scintilla source code edit control

/** @file RESearch.cxx

** Regular expression search library.

**/

* regex - Regular expression pattern matching and replacement

* By: Ozan S. Yigit (oz)

* Dept. of Computer Science

* York University

* Original code available from http://www.cs.yorku.ca/~oz/

* Translation to C++ by Neil Hodgson neilh@scintilla.org

* Removed all use of register.

* Converted to modern function prototypes.

* Put all global/static variables into an object so this code can be

* used from multiple threads, etc.

* Some extensions by Philippe Lhoste PhiLho(a)GMX.net

* These routines are the PUBLIC DOMAIN equivalents of regex

* routines as found in 4.nBSD UN*X, with minor extensions.

* These routines are derived from various implementations found

* in software tools books, and Conroy's grep. They are NOT derived

* from licensed/restricted software.

* For more interesting/academic/complicated implementations,

* see Henry Spencer's regexp routines, or GNU Emacs pattern

* matching module.

* Modification history removed.

* Interfaces:

* RESearch::Compile: compile a regular expression into a NFA.

* const char *RESearch::Compile(const char *pattern, int length,

* bool caseSensitive, bool posix)

* Returns a short error string if they fail.

* RESearch::Execute: execute the NFA to match a pattern.

* int RESearch::Execute(CharacterIndexer &ci, int lp, int endp)

* RESearch::Substitute: substitute the matched portions in a new string.

* int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)

* re_fail: failure routine for RESearch::Execute. (no longer used)

* void re_fail(char *msg, char op)

* Regular Expressions:

* [1] char matches itself, unless it is a special

* character (metachar): . \ [ ] * + ^ $

* and ( ) if posix option.

* [2] . matches any character.

* [3] \ matches the character following it, except:

* - \a, \b, \f, \n, \r, \t, \v match the corresponding C

* escape char, respectively BEL, BS, FF, LF, CR, TAB and VT;

* Note that \r and \n are never matched because Scintilla

* regex searches are made line per line

* (stripped of end-of-line chars).

* - if not in posix mode, when followed by a

* left or right round bracket (see [7]);

* - when followed by a digit 1 to 9 (see [8]);

* - when followed by a left or right angle bracket

* (see [9]);

* - when followed by d, D, s, S, w or W (see [10]);

* - when followed by x and two hexa digits (see [11]).

* Backslash is used as an escape character for all

* other meta-characters, and itself.

* [4] [set] matches one of the characters in the set.

* If the first character in the set is "^",

* it matches the characters NOT in the set, i.e.

* complements the set. A shorthand S-E (start dash end)

* is used to specify a set of characters S up to

* E, inclusive. S and E must be characters, otherwise

* the dash is taken literally (eg. in expression [\d-a]).

* The special characters "]" and "-" have no special

* meaning if they appear as the first chars in the set.

* To include both, put - first: [-]A-Z]

* (or just backslash them).

* examples: match:

* [-]|] matches these 3 chars,

* []-|] matches from ] to | chars

* [a-z] any lowercase alpha

* [^-]] any char except - and ]

* [^A-Z] any char except uppercase

* alpha

100

101

* [a-zA-Z] any alpha

102

103

* [5] * any regular expression form [1] to [4]

104

* (except [7], [8] and [9] forms of [3]),

105

* followed by closure char (*)

106

* matches zero or more matches of that form.

107

108

* [6] + same as [5], except it matches one or more.

109

* Both [5] and [6] are greedy (they match as much as possible).

110

111

* [7] a regular expression in the form [1] to [12], enclosed

112

* as \(form\) (or (form) with posix flag) matches what

113

* form matches. The enclosure creates a set of tags,

114

* used for [8] and for pattern substitution.

115

* The tagged forms are numbered starting from 1.

116

117

* [8] a \ followed by a digit 1 to 9 matches whatever a

118

* previously tagged regular expression ([7]) matched.

119

120

* [9] \< a regular expression starting with a \< construct

121

* \> and/or ending with a \> construct, restricts the

122

* pattern matching to the beginning of a word, and/or

123

* the end of a word. A word is defined to be a character

124

* string beginning and/or ending with the characters

125

* A-Z a-z 0-9 and _. Scintilla extends this definition

126

* by user setting. The word must also be preceded and/or

127

* followed by any character outside those mentioned.

128

129

* [10] \l a backslash followed by d, D, s, S, w or W,

130

* becomes a character class (both inside and

131

* outside sets []).

132

* d: decimal digits

133

* D: any char except decimal digits

134

* s: whitespace (space, \t \n \r \f \v)

135

* S: any char except whitespace (see above)

136

* w: alphanumeric & underscore (changed by user setting)

137

* W: any char except alphanumeric & underscore (see above)

138

139

* [11] \xHH a backslash followed by x and two hexa digits,

140

* becomes the character whose Ascii code is equal

141

* to these digits. If not followed by two digits,

142

* it is 'x' char itself.

143

144

* [12] a composite regular expression xy where x and y

145

* are in the form [1] to [11] matches the longest

146

* match of x followed by a match for y.

147

148

* [13] ^ a regular expression starting with a ^ character

149

* $ and/or ending with a $ character, restricts the

150

* pattern matching to the beginning of the line,

151

* or the end of line. [anchors] Elsewhere in the

152

* pattern, ^ and $ are treated as ordinary characters.

153

154

155

* Acknowledgements:

156

157

* HCR's Hugh Redelmeier has been most helpful in various

158

* stages of development. He convinced me to include BOW

159

* and EOW constructs, originally invented by Rob Pike at

160

* the University of Toronto.

161

162

* References:

163

* Software tools Kernighan & Plauger

164

* Software tools in Pascal Kernighan & Plauger

165

* Grep [rsx-11 C dist] David Conroy

166

* ed - text editor Un*x Programmer's Manual

167

* Advanced editing on Un*x B. W. Kernighan

168

* RegExp routines Henry Spencer

169

170

* Notes:

171

172

* This implementation uses a bit-set representation for character

173

* classes for speed and compactness. Each character is represented

174

* by one bit in a 256-bit block. Thus, CCL always takes a

175

* constant 32 bytes in the internal nfa, and RESearch::Execute does a single

176

* bit comparison to locate the character in the set.

177

178

* Examples:

179

180

* pattern: foo*.*

181

* compile: CHR f CHR o CLO CHR o END CLO ANY END END

182

* matches: fo foo fooo foobar fobar foxx ...

183

184

* pattern: fo[ob]a[rz]

185

* compile: CHR f CHR o CCL bitset CHR a CCL bitset END

186

* matches: fobar fooar fobaz fooaz

187

188

* pattern: foo\\+

189

* compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END

190

* matches: foo\ foo\\ foo\\\ ...

191

192

* pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)

193

* compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END

194

* matches: foo1foo foo2foo foo3foo

195

196

* pattern: \(fo.*\)-\1

197

* compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END

198

* matches: foo-foo fo-fo fob-fob foobar-foobar ...

199

200

201

#include "CharClassify.h"

202

#include "RESearch.h"

203

204

// Shut up annoying Visual C++ warnings:

205

#ifdef _MSC_VER

206

#pragma warning(disable: 4514)

207

#endif

208

209

#ifdef SCI_NAMESPACE

210

using namespace Scintilla;

211

#endif

212

213

#define OKP 1

214

#define NOP 0

215

216

#define CHR 1

217

#define ANY 2

218

#define CCL 3

219

#define BOL 4

220

#define EOL 5

221

#define BOT 6

222

#define EOT 7

223

#define BOW 8

224

#define EOW 9

225

#define REF 10

226

#define CLO 11

227

228

#define END 0

229

230

231

/*
 * The following defines are not meant to be changeable.
 * They are for readability only.
 */

233

234

#define BLKIND 0370

235

#define BITIND 07

236

237

const char bitarr[] = { 1, 2, 4, 8, 16, 32, 64, '\200' };

238

239

#define badpat(x) (*nfa = END, x)

240

241

242

/*
 * Character classification table for word boundary operators BOW
 * and EOW is passed in by the creator of this object (Scintilla
 * Document). The Document default state is that word chars are:
 * 0-9, a-z, A-Z and _
 */

246

247

248

/**
 * Build a searcher that uses the caller-supplied character classification
 * table, then put the remaining state into its initial condition via Init().
 * @param charClassTable table stored in charClass; it is only stored here,
 *        not copied.
 */
RESearch::RESearch(CharClassify *charClassTable) : charClass(charClassTable) {
	Init();
}

252

253

// Destroying the searcher releases the captured tag strings through Clear().
RESearch::~RESearch() {
	this->Clear();
}

256

257

void RESearch::Init() {

258

sta = NOP; /* status of lastpat */

259

bol = 0;

260

for (int i = 0; i < MAXTAG; i++)

261

pat[i] = 0;

262

for (int j = 0; j < BITBLK; j++)

263

bittab[j] = 0;

264

}

265

266

void RESearch::Clear() {

267

for (int i = 0; i < MAXTAG; i++) {

268

delete []pat[i];

269

pat[i] = 0;

270

bopat[i] = NOTFOUND;

271

eopat[i] = NOTFOUND;

272

}

273

}

274

275

/**
 * Copy the text of every tag that took part in the last match into pat[].
 *
 * For each tag whose begin and end positions are both set, a NUL-terminated
 * copy of the characters bopat[i]..eopat[i]-1 is read from ci and stored in
 * pat[i]. Tags without both positions are left untouched.
 *
 * @param ci source of the characters, indexed by document position.
 * @return true unless an allocation produced a null pointer.
 */
bool RESearch::GrabMatches(CharacterIndexer &ci) {
	bool success = true;
	for (unsigned int i = 0; i < MAXTAG; i++) {
		if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
			unsigned int len = eopat[i] - bopat[i];
			// A previous call may already have stored a copy for this tag;
			// free it first so that calling GrabMatches again without an
			// intervening Clear() does not leak the old buffer.
			delete []pat[i];
			pat[i] = 0;
			pat[i] = new char[len + 1];
			if (pat[i]) {
				for (unsigned int j = 0; j < len; j++)
					pat[i][j] = ci.CharAt(bopat[i] + j);
				pat[i][len] = '\0';
			} else {
				success = false;
			}
		}
	}
	return success;
}

292

293

void RESearch::ChSet(unsigned char c) {

294

bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];

295

}

296

297

void RESearch::ChSetWithCase(unsigned char c, bool caseSensitive) {

298

if (caseSensitive) {

299

ChSet(c);

300

} else {

301

if ((c >= 'a') && (c <= 'z')) {

302

ChSet(c);

303

ChSet(static_cast<unsigned char>(c - 'a' + 'A'));

304

} else if ((c >= 'A') && (c <= 'Z')) {

305

ChSet(c);

306

ChSet(static_cast<unsigned char>(c - 'A' + 'a'));

307

} else {

308

ChSet(c);

309

}

310

}

311

}

312

313

/**
 * Translate the letter of a C-style escape (a, b, f, n, r, t, v) into the
 * control character it denotes.
 * @param ch the character that followed the backslash.
 * @return the control character, or 0 when ch is not one of those letters.
 */
const unsigned char escapeValue(unsigned char ch) {
	// The two tables are paired index for index.
	static const char letters[] = "abfnrtv";
	static const char controls[] = "\a\b\f\n\r\t\v";
	for (int k = 0; letters[k] != '\0'; k++) {
		if (ch == static_cast<unsigned char>(letters[k]))
			return static_cast<unsigned char>(controls[k]);
	}
	return 0;
}

325

326

/**
 * Combine two hexadecimal digit characters into the value they spell.
 * @param hd1 most significant digit (0-9, A-F or a-f).
 * @param hd2 least significant digit (0-9, A-F or a-f).
 * @return 0..255, or -1 when either character is not a hexadecimal digit.
 */
static int GetHexaChar(unsigned char hd1, unsigned char hd2) {
	const unsigned char digits[2] = { hd1, hd2 };
	int value = 0;
	for (int k = 0; k < 2; k++) {
		const unsigned char d = digits[k];
		int nibble;
		if (d >= '0' && d <= '9')
			nibble = d - '0';
		else if (d >= 'A' && d <= 'F')
			nibble = d - 'A' + 10;
		else if (d >= 'a' && d <= 'f')
			nibble = d - 'a' + 10;
		else
			return -1;
		value = value * 16 + nibble;
	}
	return value;
}

346

347

/**

348

* Called when the parser finds a backslash not followed

349

* by a valid expression (like \( in non-Posix mode).

350

* @param pattern: pointer on the char after the backslash.

351

* @param incr: (out) number of chars to skip after expression evaluation.

352

* @return the char if it resolves to a simple char,

353

* or -1 for a char class. In this case, bittab is changed.

354

355

int RESearch::GetBackslashExpression(

356

const char *pattern,

357

int &incr) {

358

// Since error reporting is primitive and messages are not used anyway,

359

// I choose to interpret unexpected syntax in a logical way instead

360

// of reporting errors. Otherwise, we can stick on, eg., PCRE behavior.

361

incr = 0; // Most of the time, will skip the char "naturally".

362

int c;

363

int result = -1;

364

unsigned char bsc = *pattern;

365

if (!bsc) {

366

// Avoid overrun

367

result = '\\'; // \ at end of pattern, take it literally

368

return result;

369

}

370

371

switch (bsc) {

372

case 'a':

373

case 'b':

374

case 'n':

375

case 'f':

376

case 'r':

377

case 't':

378

case 'v':

379

result = escapeValue(bsc);

380

break;

381

case 'x': {

382

unsigned char hd1 = *(pattern + 1);

383

unsigned char hd2 = *(pattern + 2);

384

int hexValue = GetHexaChar(hd1, hd2);

385

if (hexValue >= 0) {

386

result = hexValue;

387

incr = 2; // Must skip the digits

388

} else {

389

result = 'x'; // \x without 2 digits: see it as 'x'

390

}

391

}

392

break;

393

case 'd':

394

for (c = '0'; c <= '9'; c++) {

395

ChSet(static_cast<unsigned char>(c));

396

}

397

break;

398

case 'D':

399

for (c = 0; c < MAXCHR; c++) {

400

if (c < '0' || c > '9') {

401

ChSet(static_cast<unsigned char>(c));

402

}

403

}

404

break;

405

case 's':

406

ChSet(' ');

407

ChSet('\t');

408

ChSet('\n');

409

ChSet('\r');

410

ChSet('\f');

411

ChSet('\v');

412

break;

413

case 'S':

414

for (c = 0; c < MAXCHR; c++) {

415

if (c != ' ' && !(c >= 0x09 && c <= 0x0D)) {

416

ChSet(static_cast<unsigned char>(c));

417

}

418

}

419

case 'w':

420

for (c = 0; c < MAXCHR; c++) {

421

if (iswordc(static_cast<unsigned char>(c))) {

422

ChSet(static_cast<unsigned char>(c));

423

}

424

}

425

break;

426

case 'W':

427

for (c = 0; c < MAXCHR; c++) {

428

if (!iswordc(static_cast<unsigned char>(c))) {

429

ChSet(static_cast<unsigned char>(c));

430

}

431

}

432

break;

433

default:

434

result = bsc;

435

}

436

return result;

437

}

438

439

const char *RESearch::Compile(const char *pattern, int length, bool caseSensitive, bool posix) {

440

char *mp=nfa; /* nfa pointer */

441

char *lp; /* saved pointer */

442

char *sp=nfa; /* another one */

443

char *mpMax = mp + MAXNFA - BITBLK - 10;

444

445

int tagi = 0; /* tag stack index */

446

int tagc = 1; /* actual tag count */

447

448

int n;

449

char mask; /* xor mask -CCL/NCL */

450

int c1, c2, prevChar;

451

452

if (!pattern || !length)

453

if (sta)

454

return 0;

455

else

456

return badpat("No previous regular expression");

457

sta = NOP;

458

459

const char *p=pattern; /* pattern pointer */

460

for (int i=0; i<length; i++, p++) {

461

if (mp > mpMax)

462

return badpat("Pattern too long");

463

lp = mp;

464

switch (*p) {

465

466

case '.': /* match any char */

467

*mp++ = ANY;

468

break;

469

470

case '^': /* match beginning */

471

if (p == pattern)

472

*mp++ = BOL;

473

else {

474

*mp++ = CHR;

475

*mp++ = *p;

476

}

477

break;

478

479

case '$': /* match endofline */

480

if (!*(p+1))

481

*mp++ = EOL;

482

else {

483

*mp++ = CHR;

484

*mp++ = *p;

485

}

486

break;

487

488

case '[': /* match char class */

489

*mp++ = CCL;

490

prevChar = 0;

491

492

i++;

493

if (*++p == '^') {

494

mask = '\377';

495

i++;

496

p++;

497

} else

498

mask = 0;

499

500

if (*p == '-') { /* real dash */

501

i++;

502

prevChar = *p;

503

ChSet(*p++);

504

}

505

if (*p == ']') { /* real brace */

506

i++;

507

prevChar = *p;

508

ChSet(*p++);

509

}

510

while (*p && *p != ']') {

511

if (*p == '-') {

512

if (prevChar < 0) {

513

// Previous def. was a char class like \d, take dash literally

514

prevChar = *p;

515

ChSet(*p);

516

} else if (*(p+1)) {

517

if (*(p+1) != ']') {

518

c1 = prevChar + 1;

519

i++;

520

c2 = *++p;

521

if (c2 == '\\') {

522

if (!*(p+1)) // End of RE

523

return badpat("Missing ]");

524

else {

525

i++;

526

p++;

527

int incr;

528

c2 = GetBackslashExpression(p, incr);

529

i += incr;

530

p += incr;

531

if (c2 >= 0) {

532

// Convention: \c (c is any char) is case sensitive, whatever the option

533

ChSet(static_cast<unsigned char>(c2));

534

prevChar = c2;

535

} else {

536

// bittab is already changed

537

prevChar = -1;

538

}

539

}

540

}

541

if (prevChar < 0) {

542

// Char after dash is char class like \d, take dash literally

543

prevChar = '-';

544

ChSet('-');

545

} else {

546

// Put all chars between c1 and c2 included in the char set

547

while (c1 <= c2) {

548

ChSetWithCase(static_cast<unsigned char>(c1++), caseSensitive);

549

}

550

}

551

} else {

552

// Dash before the ], take it literally

553

prevChar = *p;

554

ChSet(*p);

555

}

556

} else {

557

return badpat("Missing ]");

558

}

559

} else if (*p == '\\' && *(p+1)) {

560

i++;

561

p++;

562

int incr;

563

int c = GetBackslashExpression(p, incr);

564

i += incr;

565

p += incr;

566

if (c >= 0) {

567

// Convention: \c (c is any char) is case sensitive, whatever the option

568

ChSet(static_cast<unsigned char>(c));

569

prevChar = c;

570

} else {

571

// bittab is already changed

572

prevChar = -1;

573

}

574

} else {

575

prevChar = *p;

576

ChSetWithCase(*p, caseSensitive);

577

}

578

i++;

579

p++;

580

}

581

if (!*p)

582

return badpat("Missing ]");

583

584

for (n = 0; n < BITBLK; bittab[n++] = 0)

585

*mp++ = static_cast<char>(mask ^ bittab[n]);

586

587

break;

588

589

case '*': /* match 0 or more... */

590

case '+': /* match 1 or more... */

591

if (p == pattern)

592

return badpat("Empty closure");

593

lp = sp; /* previous opcode */

594

if (*lp == CLO) /* equivalence... */

595

break;

596

switch (*lp) {

597

598

case BOL:

599

case BOT:

600

case EOT:

601

case BOW:

602

case EOW:

603

case REF:

604

return badpat("Illegal closure");

605

default:

606

break;

607

}

608

609

if (*p == '+')

610

for (sp = mp; lp < sp; lp++)

611

*mp++ = *lp;

612

613

*mp++ = END;

614

*mp++ = END;

615

sp = mp;

616

while (--mp > lp)

617

*mp = mp[-1];

618

*mp = CLO;

619

mp = sp;

620

break;

621

622

case '\\': /* tags, backrefs... */

623

i++;

624

switch (*++p) {

625

case '<':

626

*mp++ = BOW;

627

break;

628

case '>':

629

if (*sp == BOW)

630

return badpat("Null pattern inside \\<\\>");

631

*mp++ = EOW;

632

break;

633

case '1':

634

case '2':

635

case '3':

636

case '4':

637

case '5':

638

case '6':

639

case '7':

640

case '8':

641

case '9':

642

n = *p-'0';

643

if (tagi > 0 && tagstk[tagi] == n)

644

return badpat("Cyclical reference");

645

if (tagc > n) {

646

*mp++ = static_cast<char>(REF);

647

*mp++ = static_cast<char>(n);

648

} else

649

return badpat("Undetermined reference");

650

break;

651

default:

652

if (!posix && *p == '(') {

653

if (tagc < MAXTAG) {

654

tagstk[++tagi] = tagc;

655

*mp++ = BOT;

656

*mp++ = static_cast<char>(tagc++);

657

} else

658

return badpat("Too many \$\$ pairs");

659

} else if (!posix && *p == ')') {

660

if (*sp == BOT)

661

return badpat("Null pattern inside \$\$");

662

if (tagi > 0) {

663

*mp++ = static_cast<char>(EOT);

664

*mp++ = static_cast<char>(tagstk[tagi--]);

665

} else

666

return badpat("Unmatched \\)");

667

} else {

668

int incr;

669

int c = GetBackslashExpression(p, incr);

670

i += incr;

671

p += incr;

672

if (c >= 0) {

673

*mp++ = CHR;

674

*mp++ = static_cast<unsigned char>(c);

675

} else {

676

*mp++ = CCL;

677

mask = 0;

678

for (n = 0; n < BITBLK; bittab[n++] = 0)

679

*mp++ = static_cast<char>(mask ^ bittab[n]);

680

}

681

}

682

}

683

break;

684

685

default : /* an ordinary char */

686

if (posix && *p == '(') {

687

if (tagc < MAXTAG) {

688

tagstk[++tagi] = tagc;

689

*mp++ = BOT;

690

*mp++ = static_cast<char>(tagc++);

691

} else

692

return badpat("Too many () pairs");

693

} else if (posix && *p == ')') {

694

if (*sp == BOT)

695

return badpat("Null pattern inside ()");

696

if (tagi > 0) {

697

*mp++ = static_cast<char>(EOT);

698

*mp++ = static_cast<char>(tagstk[tagi--]);

699

} else

700

return badpat("Unmatched )");

701

} else {

702

unsigned char c = *p;

703

if (!c) // End of RE

704

c = '\\'; // We take it as raw backslash

705

if (caseSensitive || !iswordc(c)) {

706

*mp++ = CHR;

707

*mp++ = c;

708

} else {

709

*mp++ = CCL;

710

mask = 0;

711

ChSetWithCase(c, false);

712

for (n = 0; n < BITBLK; bittab[n++] = 0)

713

*mp++ = static_cast<char>(mask ^ bittab[n]);

714

}

715

}

716

break;

717

}

718

sp = lp;

719

}

720

if (tagi > 0)

721

return badpat((posix ? "Unmatched (" : "Unmatched \\("));

722

*mp = END;

723

sta = OKP;

724

return 0;

725

}

726

727

728

/*
 * RESearch::Execute:
 *   execute nfa to find a match.
 *
 *  special cases: (nfa[0])
 *      BOL
 *          Match only once, starting from the
 *          beginning.
 *      CHR
 *          First locate the character without
 *          calling PMatch, and if found, call
 *          PMatch for the remaining string.
 *      END
 *          RESearch::Compile failed, poor luser did not
 *          check for it. Fail fast.
 *
 *  If a match is found, bopat[0] and eopat[0] are set
 *  to the beginning and the end of the matched fragment,
 *  respectively.
 *
 */

746

747

748

/**
 * Run the compiled automaton over ci, looking for the first match that
 * starts at or after lp and before endp.
 *
 * Clear() is called first, so tag positions and grabbed tag text from an
 * earlier match are discarded.
 *
 * @param ci   source of the characters, indexed by position.
 * @param lp   position to start searching from; also recorded as the
 *             beginning of line in bol.
 * @param endp position one past the last character to consider.
 * @return 1 when a match was found (bopat[0] and eopat[0] then hold its
 *         start and end), 0 otherwise.
 */
int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
	unsigned char c;
	int ep = NOTFOUND;
	char *ap = nfa;

	bol = lp;
	failure = 0;

	Clear();

	switch (*ap) {

	case BOL:			/* anchored: match from BOL only */
		ep = PMatch(ci, lp, endp, ap);
		break;
	case EOL:			/* just searching for end of line normal path doesn't work */
		if (*(ap+1) == END) {
			lp = endp;
			ep = lp;
			break;
		} else {
			return 0;
		}
	case CHR:			/* ordinary char: locate it fast */
		c = *(ap+1);
		// Compare as unsigned on both sides. NOTE(review): CharAt is declared
		// outside this file; if it returns a signed char, the uncast
		// comparison could never succeed for characters >= 0x80.
		while ((lp < endp) && (static_cast<unsigned char>(ci.CharAt(lp)) != c))
			lp++;
		if (lp >= endp)	/* if EOS, fail, else fall thru. */
			return 0;
	default:			/* regular matching all the way. */
		while (lp < endp) {
			ep = PMatch(ci, lp, endp, ap);
			if (ep != NOTFOUND)
				break;
			lp++;
		}
		break;
	case END:			/* munged automaton. fail always */
		return 0;
	}
	if (ep == NOTFOUND)
		return 0;

	bopat[0] = lp;
	eopat[0] = ep;
	return 1;
}

795

796

797

/*
 * PMatch: internal routine for the hard part
 *
 *  This code is partly snarfed from an early grep written by
 *  David Conroy. The backref and tag stuff, and various other
 *  innovations are by oz.
 *
 *  special case optimizations: (nfa[n], nfa[n+1])
 *      CLO ANY
 *          We KNOW .* will match everything up to the
 *          end of line. Thus, directly go to the end of
 *          line, without recursive PMatch calls. As in
 *          the other closure cases, the remaining pattern
 *          must be matched by moving backwards on the
 *          string recursively, to find a match for xy
 *          (x is ".*" and y is the remaining pattern)
 *          where the match satisfies the LONGEST match for
 *          x followed by a match for y.
 *      CLO CHR
 *          We can again scan the string forward for the
 *          single char and at the point of failure, we
 *          execute the remaining nfa recursively, same as
 *          above.
 *
 *  At the end of a successful match, bopat[n] and eopat[n]
 *  are set to the beginning and end of subpatterns matched
 *  by tagged expressions (n = 1 to 9).
 *
 */

823

824

825

extern void re_fail(char *,char);

826

827

/* Non-zero when the bit for character y is set in the bit-set x:
 * ((y) & BLKIND) >> 3 picks the byte of x, bitarr[(y) & BITIND] the mask
 * within that byte. Note that y is expanded (and so evaluated) twice. */
#define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])

828

829

830

* skip values for CLO XXX to skip past the closure

831

832

833

/* Bytes to step over after CLO when the closed item is ANY:
 * the ANY opcode plus the END that terminates the closure. */
#define ANYSKIP 2 /* [CLO] ANY END */

834

/* Bytes to step over after CLO when the closed item is CHR:
 * the CHR opcode, its literal character, and the closing END. */
#define CHRSKIP 3 /* [CLO] CHR chr END */

835

/* Bytes to step over after CLO when the closed item is CCL:
 * the CCL opcode, its 32-byte bit-set, and the closing END. */
#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */

836

837

/*
 * Run the compiled pattern starting at ap against the text starting at
 * position lp, never consuming ANY/CCL characters at or beyond endp.
 *
 * Returns the position just past the matched text, or NOTFOUND when the
 * pattern does not match here. BOT/EOT opcodes store the current position
 * in bopat[]/eopat[] as they are executed. An unrecognised opcode inside a
 * closure sets the failure flag.
 */
int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
	for (int opcode = *ap++; opcode != END; opcode = *ap++) {
		switch (opcode) {

		case CHR: {
			/* literal character: operand byte must equal the text char */
			const char wanted = *ap++;
			if (ci.CharAt(lp++) != wanted)
				return NOTFOUND;
			break;
		}

		case ANY:
			/* any single character, as long as one is left */
			if (lp >= endp)
				return NOTFOUND;
			++lp;
			break;

		case CCL: {
			/* character class: operand is a BITBLK-byte bit-set */
			if (lp >= endp)
				return NOTFOUND;
			const int ch = ci.CharAt(lp++);
			if (!isinset(ap, ch))
				return NOTFOUND;
			ap += BITBLK;
			break;
		}

		case BOL:
			if (lp != bol)
				return NOTFOUND;
			break;

		case EOL:
			if (lp < endp)
				return NOTFOUND;
			break;

		case BOT: {
			/* start of tagged sub-pattern: remember where it begins */
			const int tag = *ap++;
			bopat[tag] = lp;
			break;
		}

		case EOT: {
			/* end of tagged sub-pattern: remember where it ends */
			const int tag = *ap++;
			eopat[tag] = lp;
			break;
		}

		case BOW:
			/* word start: previous char (if any) is not a word char,
			 * and the current char is one */
			if ((lp != bol) && iswordc(ci.CharAt(lp - 1)))
				return NOTFOUND;
			if (!iswordc(ci.CharAt(lp)))
				return NOTFOUND;
			break;

		case EOW:
			/* word end: previous char is a word char, current is not */
			if (lp == bol)
				return NOTFOUND;
			if (!iswordc(ci.CharAt(lp - 1)))
				return NOTFOUND;
			if (iswordc(ci.CharAt(lp)))
				return NOTFOUND;
			break;

		case REF: {
			/* back-reference: text here must repeat the tagged match */
			const int tag = *ap++;
			const int refEnd = eopat[tag];
			for (int refPos = bopat[tag]; refPos < refEnd; ++refPos, ++lp) {
				if (ci.CharAt(refPos) != ci.CharAt(lp))
					return NOTFOUND;
			}
			break;
		}

		case CLO: {
			/* closure: consume as much as possible, then back off one
			 * position at a time until the rest of the pattern matches */
			const int closureStart = lp;
			int skip;

			switch (*ap) {
			case ANY:
				if (lp < endp)
					lp = endp;
				skip = ANYSKIP;
				break;
			case CHR: {
				const int repeated = *(ap + 1);
				while ((lp < endp) && (repeated == ci.CharAt(lp)))
					lp++;
				skip = CHRSKIP;
				break;
			}
			case CCL:
				while ((lp < endp) && isinset(ap + 1, ci.CharAt(lp)))
					lp++;
				skip = CCLSKIP;
				break;
			default:
				failure = true;
				//re_fail("closure: bad nfa.", *ap);
				return NOTFOUND;
			}

			ap += skip;

			for (; lp >= closureStart; --lp) {
				const int matchEnd = PMatch(ci, lp, endp, ap);
				if (matchEnd != NOTFOUND)
					return matchEnd;
			}
			return NOTFOUND;
		}

		default:
			//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
			return NOTFOUND;
		}
	}
	return lp;
}

933

934

935

* RESearch::Substitute:

936

* substitute the matched portions of the src in dst.

937

938

* & substitute the entire matched pattern.

939

940

* \digit substitute a subpattern, with the given tag number.

941

* Tags are numbered from 1 to 9. If the particular

942

* tagged subpattern does not exist, null is substituted.

943

944

/*
 * Expand the replacement template src into dst using the sub-pattern
 * positions recorded in bopat[]/eopat[].
 *
 *	&	is replaced by the text of pattern 0 (the whole match).
 *	\digit	is replaced by the text of the tagged sub-pattern 'digit'.
 *	\other	copies 'other' literally.
 *
 * Returns 1 on success with dst NUL-terminated. Returns 0 when src is
 * empty, when bopat[0] is 0, or when a NUL character is met inside a
 * sub-pattern before its recorded end; dst is not terminated in that
 * last case. The caller must supply a dst large enough for the result;
 * no bounds checking is done here.
 */
int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
	unsigned char c;

	/* NOTE(review): a stored position of 0 is treated as "not set" here
	 * and in the test before copying below, yet Execute stores plain match
	 * offsets in bopat[0]/eopat[0]. If 0 is a valid match offset, matches
	 * starting there cannot be substituted - confirm against how
	 * bopat/eopat are initialised before changing this. */
	if (!*src || !bopat[0])
		return 0;

	while ((c = *src++) != 0) {
		int pin;

		switch (c) {

		case '&':
			pin = 0;
			break;

		case '\\':
			c = *src++;
			if (c >= '0' && c <= '9') {
				pin = c - '0';
				break;
			}
			if (c == 0) {
				/* Lone backslash at the end of the template: there is
				 * nothing to escape. Finish here instead of stepping
				 * past the terminator and reading beyond src. */
				*dst = '\0';
				return 1;
			}
			/* fall through: escaped ordinary character */

		default:
			*dst++ = c;
			continue;
		}

		int bp = bopat[pin];
		const int ep = eopat[pin];
		if (bp != 0 && ep != 0) {
			/* Check the bound first so the position at ep is not read. */
			while (bp < ep && ci.CharAt(bp))
				*dst++ = ci.CharAt(bp++);
			if (bp < ep)
				return 0;
		}
	}
	*dst = '\0';
	return 1;
}

982

Older »