~ubuntu-branches/ubuntu/utopic/cccc/utopic

« back to all changes in this revision

Viewing changes to pccts/CHANGES_FROM_133_BEFORE_MR13.txt

Committer: Bazaar Package Importer
Author(s): Colin Watson
Date: 2003-08-23 04:34:05 UTC
Revision ID: james.westby@ubuntu.com-20030823043405-xnzd3mn3hwtvi6dr

Tags: upstream-3.pre81

Import upstream version 3.pre81

files added:

bug_reports

bug_reports/prn10.html

bug_reports/prn11.html

bug_reports/prn12.html

bug_reports/prn8.html

bug_reports/prn9.html

build_posixgcc.sh

build_w32bcc.PIF

build_w32bcc.bat

build_w32vc.PIF

build_w32vc.bat

cccc

cccc/CCCC User Guide.html

cccc/Cccc.dsp

cccc/ada.g

cccc/cccc.g

cccc/cccc.h

cccc/cccc_ast.cc

cccc/cccc_ast.h

cccc/cccc_db.cc

cccc/cccc_db.h

cccc/cccc_ext.cc

cccc/cccc_ext.h

cccc/cccc_htm.cc

cccc/cccc_htm.h

cccc/cccc_itm.cc

cccc/cccc_itm.h

cccc/cccc_mem.cc

cccc/cccc_mem.h

cccc/cccc_met.cc

cccc/cccc_met.h

cccc/cccc_mod.cc

cccc/cccc_mod.h

cccc/cccc_new.cc

cccc/cccc_opt.cc

cccc/cccc_opt.h

cccc/cccc_prj.cc

cccc/cccc_prj.h

cccc/cccc_rec.cc

cccc/cccc_rec.h

cccc/cccc_tbl.cc

cccc/cccc_tbl.h

cccc/cccc_tok.cc

cccc/cccc_tok.h

cccc/cccc_tpl.cc

cccc/cccc_use.cc

cccc/cccc_use.h

cccc/cccc_utl.cc

cccc/cccc_utl.h

cccc/cccc_ver.h

cccc/cccc_xml.cc

cccc/cccc_xml.h

cccc/ccccmain.cc

cccc/java.g

cccc/posixgcc.mak

cccc/rules.mak

cccc/w32bcb.mak

cccc/w32bcc.mak

cccc/w32bcc55.mak

cccc/w32cygnus.mak

cccc/w32cygnus_gmake.mak

cccc/w32cygnus_nmake.mak

cccc/w32vc.mak

ccccdist.dsw

changes.txt

documentation.dsp

install

install/install.dsp

install/install.mak

makefile

pccts

pccts/CHANGES_FROM_131.txt

pccts/CHANGES_FROM_133.txt

pccts/CHANGES_FROM_133_BEFORE_MR13.txt

pccts/IBM_VISUAL_AGE_PROJECTS

pccts/IBM_VISUAL_AGE_PROJECTS/antlr.icc

pccts/IBM_VISUAL_AGE_PROJECTS/dlg.icc

pccts/IBM_VISUAL_AGE_PROJECTS/sorcerer.icc

pccts/KNOWN_PROBLEMS.txt

pccts/MPW_Read_Me

pccts/NOTES.OS2

pccts/NOTES.bcc

pccts/NOTES.msvc

pccts/NOTES.watcom

pccts/README

pccts/RIGHTS

pccts/antlr

pccts/antlr/Antlr.SUP

pccts/antlr/AntlrMS.mak

pccts/antlr/AntlrMSVC50.dsp

pccts/antlr/AntlrMSVC50.dsw

pccts/antlr/AntlrMSVC50.mak

pccts/antlr/AntlrMSVC60.dsp

pccts/antlr/AntlrMSVC60.dsw

pccts/antlr/README

pccts/antlr/antlr.1

pccts/antlr/antlr.c

pccts/antlr/antlr.g

pccts/antlr/antlr.r

pccts/antlr/antlr1.txt

pccts/antlr/antlr68K.make

pccts/antlr/antlrPPC.make

pccts/antlr/bits.c

pccts/antlr/build.c

pccts/antlr/dumpcycles.c

pccts/antlr/dumpnode.c

pccts/antlr/egman.c

pccts/antlr/err.c

pccts/antlr/fcache.c

pccts/antlr/fset.c

pccts/antlr/fset2.c

pccts/antlr/gen.c

pccts/antlr/generic.h

pccts/antlr/globals.c

pccts/antlr/hash.c

pccts/antlr/hash.h

pccts/antlr/lex.c

pccts/antlr/main.c

pccts/antlr/makefile

pccts/antlr/makefile.VMS

pccts/antlr/makefile1

pccts/antlr/misc.c

pccts/antlr/mode.h

pccts/antlr/mrhoist.c

pccts/antlr/parser.dlg

pccts/antlr/pred.c

pccts/antlr/proto.h

pccts/antlr/scan.c

pccts/antlr/stdpccts.h

pccts/antlr/syn.h

pccts/antlr/tokens.h

pccts/antlr/watantlr.mak

pccts/dlg

pccts/dlg/DlgMS.mak

pccts/dlg/DlgMSVC50.dsp

pccts/dlg/DlgMSVC50.dsw

pccts/dlg/DlgMSVC60.dsp

pccts/dlg/DlgMSVC60.dsw

pccts/dlg/automata.c

pccts/dlg/dlg.1

pccts/dlg/dlg.h

pccts/dlg/dlg.r

pccts/dlg/dlg1.txt

pccts/dlg/dlg68K.make

pccts/dlg/dlgPPC.make

pccts/dlg/dlg_a.c

pccts/dlg/dlg_p.c

pccts/dlg/dlg_p.g

pccts/dlg/err.c

pccts/dlg/main.c

pccts/dlg/makefile

pccts/dlg/makefile.VMS

pccts/dlg/makefile1

pccts/dlg/mode.h

pccts/dlg/output.c

pccts/dlg/parser.dlg

pccts/dlg/relabel.c

pccts/dlg/stdpccts.h

pccts/dlg/support.c

pccts/dlg/tokens.h

pccts/dlg/watdlg.mak

pccts/h

pccts/h/AParser.cpp

pccts/h/AParser.h

pccts/h/ASTBase.cpp

pccts/h/ASTBase.h

pccts/h/ATokPtr.cpp

pccts/h/ATokPtr.h

pccts/h/AToken.h

pccts/h/ATokenBuffer.cpp

pccts/h/ATokenBuffer.h

pccts/h/ATokenStream.h

pccts/h/AToken_traditional.h

pccts/h/BufFileInput.cpp

pccts/h/BufFileInput.h

pccts/h/DLG_stream_input.h

pccts/h/DLexer.cpp

pccts/h/DLexerBase.cpp

pccts/h/DLexerBase.h

pccts/h/PBlackBox.h

pccts/h/PCCTSAST.cpp

pccts/h/PCCTSAST.h

pccts/h/SList.h

pccts/h/antlr.h

pccts/h/ast.c

pccts/h/ast.h

pccts/h/charbuf.h

pccts/h/charptr.c

pccts/h/charptr.h

pccts/h/config.h

pccts/h/dlgauto.h

pccts/h/dlgdef.h

pccts/h/err.h

pccts/h/int.h

pccts/h/pccts_assert.h

pccts/h/pccts_iostream.h

pccts/h/pccts_istream.h

pccts/h/pccts_setjmp.h

pccts/h/pccts_stdarg.h

pccts/h/pccts_stdio.h

pccts/h/pccts_stdlib.h

pccts/h/pccts_string.h

pccts/h/pcctscfg.h

pccts/h/pcctslib50.dsp

pccts/h/pcctslib50.dsw

pccts/h/pcctslib60.dsp

pccts/h/pcctslib60.dsw

pccts/h/pcnames.bat

pccts/h/slist.cpp

pccts/history.ps

pccts/history.txt

pccts/install68K.mpw

pccts/installPPC.mpw

pccts/makefile

pccts/old_README

pccts/support

pccts/support/DECmms

pccts/support/DECmms/genmms.c

pccts/support/DECmms/makefile.VMS

pccts/support/genmk

pccts/support/genmk/genmk.c

pccts/support/genmk/makefile

pccts/support/genmk/watgenmk.mak

pccts/support/msvc.dsp

pccts/support/rexpr

pccts/support/rexpr/makefile

pccts/support/rexpr/rexpr.c

pccts/support/rexpr/rexpr.h

pccts/support/rexpr/test.c

pccts/support/set

pccts/support/set/set.c

pccts/support/set/set.h

pccts/support/sym

pccts/support/sym/sym.c

pccts/support/sym/template.h

readme.txt

test

test/diff.bat

test/posix.mak

test/prn1.cc

test/prn1.dbref

test/prn1.htmlref

test/prn1.xmlref

test/prn10.cc

test/prn10.dbref

test/prn10.htmlref

test/prn10.xmlref

test/prn11.cc

test/prn11.dbref

test/prn11.htmlref

test/prn11.xmlref

test/prn12.cc

test/prn12.dbref

test/prn12.htmlref

test/prn12.xmlref

test/prn13.dbref

test/prn13.htmlref

test/prn13.java

test/prn13.xmlref

test/prn14.dbref

test/prn14.htmlref

test/prn14.java

test/prn14.xmlref

test/prn15.dbref

test/prn15.htmlref

test/prn15.java

test/prn15.xmlref

test/prn2.cc

test/prn2.dbref

test/prn2.htmlref

test/prn2.xmlref

test/prn3.cc

test/prn3.dbref

test/prn3.htmlref

test/prn3.xmlref

test/prn4.cc

test/prn4.dbref

test/prn4.htmlref

test/prn4.xmlref

test/prn5.cc

test/prn5.dbref

test/prn5.htmlref

test/prn5.xmlref

test/prn6.cc

test/prn6.dbref

test/prn6.htmlref

test/prn6.xmlref

test/prn7.c

test/prn7.dbref

test/prn7.htmlref

test/prn7.xmlref

test/prn8.dbref

test/prn8.htmlref

test/prn8.java

test/prn8.out

test/prn8.xmlref

test/prn9.cc

test/prn9.dbref

test/prn9.htmlref

test/prn9.xmlref

test/rules.mak

test/test.dsp

test/test1.cc

test/test1.dbref

test/test1.htmlref

test/test1.xmlref

test/test2.cc

test/test2.dbref

test/test2.htmlref

test/test2.xmlref

test/test3.cc

test/test3.dbref

test/test3.htmlref

test/test3.xmlref

test/test4.cc

test/test4.dbref

test/test4.htmlref

test/test4.opt

test/test4.xmlref

test/vanilla.opt

test/w32bcc.mak

test/w32vc.mak

test/win32_nmake.mak

vcaddin

vcaddin/CcccDevStudioAddIn.cpp

vcaddin/CcccDevStudioAddIn.def

vcaddin/CcccDevStudioAddIn.dsp

vcaddin/CcccDevStudioAddIn.h

vcaddin/CcccDevStudioAddIn.mak

vcaddin/CcccDevStudioAddIn.odl

vcaddin/CcccDevStudioAddIn.rc

vcaddin/CommandForm.cpp

vcaddin/CommandForm.h

vcaddin/Commands.cpp

vcaddin/Commands.h

vcaddin/DSAddIn.cpp

vcaddin/DSAddIn.h

vcaddin/DevStudioFileStore.cpp

vcaddin/DevStudioFileStore.h

vcaddin/FileList.cpp

vcaddin/FileList.h

vcaddin/ReadMe.txt

vcaddin/StdAfx.cpp

vcaddin/StdAfx.h

vcaddin/WorkspaceInfo.cpp

vcaddin/WorkspaceInfo.h

vcaddin/res

vcaddin/res/CcccDevStudioAddIn.rc2

vcaddin/res/TBarLrge.bmp

vcaddin/res/TBarMedm.bmp

vcaddin/resource.h

w32installer

w32installer/cccc.iss

w32installer/ccccwrap.bat

w32installer/make_cccc_env.bat

Show diffs side-by-side

added added

removed removed

pccts/CHANGES_FROM_133_BEFORE_MR13.txt

------------------------------------------------------------

This is the second part of a two part file.

This is a list of changes to pccts 1.33 prior to MR13

For more recent information see CHANGES_FROM_133.txt

------------------------------------------------------------

DISCLAIMER

The software and these notes are provided "as is". They may include

typographical or technical errors and their authors disclaim all

liability of any kind or nature for damages due to error, fault,

defect, or deficiency regardless of cause. All warranties of any

kind, either express or implied, including, but not limited to, the

implied warranties of merchantability and fitness for a particular

purpose are disclaimed.

#153. (Changed in MR12b) Bug in computation of -mrhoist suppression set

Consider the following grammar with k=1 and "-mrhoist on":

r1 : (A)? => <<p>>? x /* l1 */

| r2 /* l2 */

;

r2 : A /* l4 */

| (B)? => <<q>>? y /* l5 */

;

In earlier versions the mrhoist routine would see that both l1 and

l2 contained predicates and would assume that this prevented either

from acting to suppress the other predicate. In the example above

it didn't realize the A at line l4 is capable of suppressing the

predicate at l1 even though alt l2 contains (indirectly) a predicate.

This is fixed in MR12b.

Reported by Reinier van den Born (reinier@vnet.ibm.com)

#153. (Changed in MR12a) Bug in computation of -mrhoist suppression set

An oversight similar to that described in Item #152 appeared in

the computation of the set that "covered" a predicate. If a

predicate expression included a term such as p=AND(q,r) the context

of p was taken to be context(q) & context(r), when it should have

been context(q) | context(r). This is fixed in MR12a.

#152. (Changed in MR12) Bug in generation of predicate expressions

The primary purpose for MR12 is to make quite clear that MR11 is

obsolete and to fix the bug related to predicate expressions.

In MR10 code was added to optimize the code generated for

predicate expression tests. Unfortunately, there was a

significant oversight in the code which resulted in a bug in

the generation of code for predicate expression tests which

contained predicates combined using AND:

r0 : (r1)* "@" ;

r1 : (AAA)? => <<p LATEXT(1)>>? r2 ;

r2 : (BBB)? => <<q LATEXT(1)>>? Q

| (BBB)? => <<r LATEXT(1)>>? Q

;

In MR11 (and MR10 when using "-mrhoist on") the code generated

for r0 to predict r1 would be equivalent to:

if ( LA(1)==Q &&

(LA(1)==AAA && LA(1)==BBB) &&

( p && ( q || r )) ) {

This is incorrect because it expresses the idea that LA(1)

*must* be AAA in order to attempt r1, and *must* be BBB to

attempt r2. The result was that r1 became unreachable since

both conditions cannot be simultaneously true.

The general philosophy of code generation for predicates

can be summarized as follows:

a. If the context is true don't enter an alt

for which the corresponding predicate is false.

If the context is false then it is okay to enter

the alt without evaluating the predicate at all.

b. A predicate created by ORing of predicates has

context which is the OR of their individual contexts.

c. A predicate created by ANDing of predicates has

(surprise) context which is the OR of their individual

contexts.

d. Apply these rules recursively.

e. Remember rule (a)

The correct code should express the idea that *if* LA(1) is

AAA then p must be true to attempt r1, but if LA(1) is *not*

AAA then it is okay to attempt r1, provided that *if* LA(1) is

100

BBB then one of q or r must be true.

101

102

if ( LA(1)==Q &&

103

( !(LA(1)==AAA || LA(1)==BBB) ||

104

( !(LA(1)==AAA) || p) &&

105

( !(LA(1)==BBB) || q || r ) ) ) {

106

107

I believe this is fixed in MR12.

108

109

Reported by Reinier van den Born (reinier@vnet.ibm.com)

110

111

#151a. (Changed in MR12) ANTLRParser::getLexer()

112

113

As a result of several requests, I have added public methods to

114

get a pointer to the lexer belonging to a parser.

115

116

ANTLRTokenStream *ANTLRParser::getLexer() const

117

118

Returns a pointer to the lexer being used by the

119

parser. ANTLRTokenStream is the base class of

120

DLGLexer

121

122

ANTLRTokenStream *ANTLRTokenBuffer::getLexer() const

123

124

Returns a pointer to the lexer being used by the

125

ANTLRTokenBuffer. ANTLRTokenStream is the base

126

class of DLGLexer

127

128

You must manually cast the ANTLRTokenStream to your program's

129

lexer class, because the name of the lexer's class is not fixed.

130

Thus it is impossible to incorporate it into the DLGLexerBase

131

class.

132

133

#151b.(Changed in MR12) ParserBlackBox member getLexer()

134

135

The template class ParserBlackBox now has a member getLexer()

136

which returns a pointer to the lexer.

137

138

#150. (Changed in MR12) syntaxErrCount and lexErrCount now public

139

140

See Item #127 for more information.

141

142

#149. (Changed in MR12) antlr option -info o (letter o for orphan)

143

144

If there is more than one rule which is not referenced by any

145

other rule then all such rules are listed. This is useful for

146

alerting one to rules which are not used, but which can still

147

contribute to ambiguity. For example:

148

149

start : a Z ;

150

unused: a A ;

151

a : (A)+ ;

152

153

will cause an ambiguity report for rule "a" which will be

154

difficult to understand if the user forgets about rule "unused"

155

simply because it is not used in the grammar.

156

157

#148. (Changed in MR11) #token names appearing in zztokens,token_tbl

158

159

In a #token statement like the following:

160

161

#token Plus "\+"

162

163

the string "Plus" appears in the zztokens array (C mode) and

164

token_tbl (C++ mode). This string is used in most error

165

messages. In MR11 one has the option of using some other string,

166

(e.g. "+") in those tables.

167

168

In MR11 one can write:

169

170

#token Plus ("+") "\+"

171

#token RP ("(") "\("

172

#token COM ("comment begin") "/\*"

173

174

A #token statement is allowed to appear in more than one #lexclass

175

with different regular expressions. However, the token name appears

176

only once in the zztokens/token_tbl array. This means that only

177

one substitute can be specified for a given #token name. The second

178

attempt to define a substitute name (different from the first) will

179

result in an error message.

180

181

#147. (Changed in MR11) Bug in follow set computation

182

183

There is a bug in 1.33 vanilla and all maintenance releases

184

prior to MR11 in the computation of the follow set. The bug is

185

different than that described in Item #82 and probably more

186

common. It was discovered in the ansi.g grammar while testing

187

the "ambiguity aid" (Item #119). The search for a bug started

188

when the ambiguity aid was unable to discover the actual source

189

of an ambiguity reported by antlr.

190

191

The problem appears when an optimization of the follow set

192

computation is used inappropriately. The result is that the

193

follow set used is the "worst case". In other words, the error

194

can lead to false reports of ambiguity. The good news is that

195

if you have a grammar in which you have addressed all reported

196

ambiguities you are ok. The bad news is that you may have spent

197

time fixing ambiguities that were not real, or used k=2 when

198

ck=2 might have been sufficient, and so on.

199

200

The following grammar demonstrates the problem:

201

202

------------------------------------------------------------

203

expr : ID ;

204

205

start : stmt SEMI ;

206

207

stmt : CASE expr COLON

208

| expr SEMI

209

| plain_stmt

210

;

211

212

plain_stmt : ID COLON ;

213

------------------------------------------------------------

214

215

When compiled with k=1 and ck=2 it will report:

216

217

warning: alts 2 and 3 of the rule itself ambiguous upon

218

{ IDENTIFIER }, { COLON }

219

220

When antlr analyzes "stmt" it computes the first[1] set of all

221

alternatives. It finds an ambiguity between alts 2 and 3 for ID.

222

It then computes the first[2] set for alternatives 2 and 3 to resolve

223

the ambiguity. In computing the first[2] set of "expr" (which is

224

only one token long) it needs to determine what could follow "expr".

225

Under a certain combination of circumstances antlr forgets that it

226

is trying to analyze "stmt" which can only be followed by SEMI and

227

adds to the first[2] set of "expr" the "global" follow set (including

228

"COLON") which could follow "expr" (under other conditions) in the

229

phrase "CASE expr COLON".

230

231

#146. (Changed in MR11) Option -treport for locating "difficult" alts

232

233

It can be difficult to determine which alternatives are causing

234

pccts to work hard to resolve an ambiguity. In some cases the

235

ambiguity is successfully resolved after much CPU time so there

236

is no message at all.

237

238

A rough measure of the amount of work being performed which is

239

independent of the CPU speed and system load is the number of

240

tnodes created. Using "-info t" gives information about the

241

total number of tnodes created and the peak number of tnodes.

242

243

Tree Nodes: peak 1300k created 1416k lost 0

244

245

It also puts in the generated C or C++ file the number of tnodes

246

created for a rule (at the end of the rule). However this

247

information is not sufficient to locate the alternatives within

248

a rule which are causing the creation of tnodes.

249

250

Using:

251

252

antlr -treport 100000 ....

253

254

causes antlr to list on stdout any alternatives which require the

255

creation of more than 100,000 tnodes, along with the lookahead sets

256

for those alternatives.

257

258

The following is a trivial case from the ansi.g grammar which shows

259

the format of the report. This report might be of more interest

260

in cases where 1,000,000 tuples were created to resolve the ambiguity.

261

262

-------------------------------------------------------------------------

263

There were 0 tuples whose ambiguity could not be resolved

264

by full lookahead

265

There were 157 tnodes created to resolve ambiguity between:

266

267

Choice 1: statement/2 line 475 file ansi.g

268

Choice 2: statement/3 line 476 file ansi.g

269

270

Intersection of lookahead[1] sets:

271

272

IDENTIFIER

273

274

Intersection of lookahead[2] sets:

275

276

LPARENTHESIS COLON AMPERSAND MINUS

277

STAR PLUSPLUS MINUSMINUS ONESCOMPLEMENT

278

NOT SIZEOF OCTALINT DECIMALINT

279

HEXADECIMALINT FLOATONE FLOATTWO IDENTIFIER

280

STRING CHARACTER

281

-------------------------------------------------------------------------

282

283

#145. (Documentation) Generation of Expression Trees

284

285

Item #99 was misleading because it implied that the optimization

286

for tree expressions was available only for trees created by

287

predicate expressions and neglected to mention that it required

288

the use of "-mrhoist on". The optimization applies to tree

289

expressions created for grammars with k>1 and for predicates with

290

lookahead depth >1.

291

292

In MR11 the optimized version is always used so the -mrhoist on

293

option need not be specified.

294

295

#144. (Changed in MR11) Incorrect test for exception group

296

297

In testing for a rule's exception group label, a pointer

298

is compared against '\0'. The intention is "*pointer".

299

300

Reported by Jeffrey C. Fried (Jeff@Fried.net).

301

302

#143. (Changed in MR11) Optional ";" at end of #token statement

303

304

Fixes problem of:

305

306

#token X "x"

307

308

309

<<parser action>>

310

311

312

Being confused with:

313

314

#token X "x" <<lexical action>>

315

316

#142. (Changed in MR11) class BufFileInput subclass of DLGInputStream

317

318

Alexey Demakov (demakov@kazbek.ispras.ru) has supplied class

319

BufFileInput derived from DLGInputStream which provides a

320

function lookahead(char *string) to test characters in the

321

input stream more than one character ahead.

322

323

The default amount of lookahead is specified by the constructor

324

and defaults to 8 characters. This does *not* include the one

325

character of lookahead maintained internally by DLG in member "ch"

326

and which is not available for testing via BufFileInput::lookahead().

327

328

This is a useful class for overcoming the one-character-lookahead

329

limitation of DLG without resorting to a lexer capable of

330

backtracking (like flex) which is not integrated with antlr as is

331

DLG.

332

333

There are no restrictions on copying or using BufFileInput.* except

334

that the authorship and related information must be retained in the

335

source code.

336

337

The class is located in pccts/h/BufFileInput.* of the kit.

338

339

#141. (Changed in MR11) ZZDEBUG_CONSUME for ANTLRParser::consume()

340

341

A debug aid has been added to ANTLRParser::consume() in

342

file AParser.cpp:

343

344

#ifdef ZZDEBUG_CONSUME_ACTION

345

zzdebug_consume_action();

346

#endif

347

348

Suggested by Sramji Ramanathan (ps@kumaran.com).

349

350

#140. (Changed in MR11) #pred to define predicates

351

352

+---------------------------------------------------+

353

| Note: Assume "-prc on" for this entire discussion |

354

+---------------------------------------------------+

355

356

A problem with predicates is that each one is regarded as

357

unique and capable of disambiguating cases where two

358

alternatives have identical lookahead. For example:

359

360

rule : <<pred(LATEXT(1))>>? A

361

| <<pred(LATEXT(1))>>? A

362

;

363

364

will not cause any error messages or warnings to be issued

365

by earlier versions of pccts. To compare the text of the

366

predicates is an incomplete solution.

367

368

In 1.33MR11 I am introducing the #pred statement in order to

369

solve some problems with predicates. The #pred statement allows

370

one to give a symbolic name to a "predicate literal" or a

371

"predicate expression" in order to refer to it in other predicate

372

expressions or in the rules of the grammar.

373

374

The predicate literal associated with a predicate symbol is C

375

or C++ code which can be used to test the condition. A

376

predicate expression defines a predicate symbol in terms of other

377

predicate symbols using "!", "&&", and "||". A predicate symbol

378

can be defined in terms of a predicate literal, a predicate

379

expression, or *both*.

380

381

When a predicate symbol is defined with both a predicate literal

382

and a predicate expression, the predicate literal is used to generate

383

code, but the predicate expression is used to check for two

384

alternatives with identical predicates in both alternatives.

385

386

Here are some examples of #pred statements:

387

388

#pred IsLabel <<isLabel(LATEXT(1))>>?

389

#pred IsLocalVar <<isLocalVar(LATEXT(1))>>?

390

#pred IsGlobalVar <<isGlobalVar(LATEXT(1))>>?

391

#pred IsVar <<isVar(LATEXT(1))>>? IsLocalVar || IsGlobalVar

392

#pred IsScoped <<isScoped(LATEXT(1))>>? IsLabel || IsLocalVar

393

394

I hope that the use of EBNF notation to describe the syntax of the

395

#pred statement will not cause problems for my readers (joke).

396

397

predStatement : "#pred"

398

CapitalizedName

399

(

400

"<<predicate_literal>>?"

401

| "<<predicate_literal>>?" predOrExpr

402

| predOrExpr

403

)

404

;

405

406

predOrExpr : predAndExpr ( "||" predAndExpr ) * ;

407

408

predAndExpr : predPrimary ( "&&" predPrimary ) * ;

409

410

predPrimary : CapitalizedName

411

| "!" predPrimary

412

| "(" predOrExpr ")"

413

;

414

415

What is the purpose of this nonsense ?

416

417

To understand how predicate symbols help, you need to realize that

418

predicate symbols are used in two different ways with two different

419

goals.

420

421

a. Allow simplification of predicates which have been combined

422

during predicate hoisting.

423

424

b. Allow recognition of identical predicates which can't disambiguate

425

alternatives with common lookahead.

426

427

First we will discuss goal (a). Consider the following rule:

428

429

rule0: rule1

430

| ID

431

| ...

432

;

433

434

rule1: rule2

435

| rule3

436

;

437

438

rule2: <<isX(LATEXT(1))>>? ID ;

439

rule3: <<!isX(LATEXT(1))>>? ID ;

440

441

When the predicates in rule2 and rule3 are combined by hoisting

442

to create a prediction expression for rule1 the result is:

443

444

if ( LA(1)==ID

445

&& ( isX(LATEXT(1)) || !isX(LATEXT(1)) ) ) { rule1(); ...

446

447

This is inefficient, but more importantly, can lead to false

448

assumptions that the predicate expression distinguishes the rule1

449

alternative with some other alternative with lookahead ID. In

450

MR11 one can write:

451

452

#pred IsX <<isX(LATEXT(1))>>?

453

454

...

455

456

rule2: <<IsX>>? ID ;

457

rule3: <<!IsX>>? ID ;

458

459

During hoisting MR11 recognizes this as a special case and

460

eliminates the predicates. The result is a prediction

461

expression like the following:

462

463

if ( LA(1)==ID ) { rule1(); ...

464

465

Please note that the following cases which appear to be equivalent

466

*cannot* be simplified by MR11 during hoisting because the hoisting

467

logic only checks for a "!" in the predicate action, not in the

468

predicate expression for a predicate symbol.

469

470

*Not* equivalent and is not simplified during hoisting:

471

472

#pred IsX <<isX(LATEXT(1))>>?

473

#pred NotX <<!isX(LATEXT(1))>>?

474

...

475

rule2: <<IsX>>? ID ;

476

rule3: <<NotX>>? ID ;

477

478

*Not* equivalent and is not simplified during hoisting:

479

480

#pred IsX <<isX(LATEXT(1))>>?

481

#pred NotX !IsX

482

...

483

rule2: <<IsX>>? ID ;

484

rule3: <<NotX>>? ID ;

485

486

Now we will discuss goal (b).

487

488

When antlr discovers that there is a lookahead ambiguity between

489

two alternatives it attempts to resolve the ambiguity by searching

490

for predicates in both alternatives. In the past any predicate

491

would do, even if the same one appeared in both alternatives:

492

493

rule: <<p(LATEXT(1))>>? X

494

| <<p(LATEXT(1))>>? X

495

;

496

497

The #pred statement is a start towards solving this problem.

498

During ambiguity resolution (*not* predicate hoisting) the

499

predicates for the two alternatives are expanded and compared.

500

Consider the following example:

501

502

#pred Upper <<isUpper(LATEXT(1))>>?

503

#pred Lower <<isLower(LATEXT(1))>>?

504

#pred Alpha <<isAlpha(LATEXT(1))>>? Upper || Lower

505

506

rule0: rule1

507

| <<Alpha>>? ID

508

;

509

510

rule1:

511

| rule2

512

| rule3

513

...

514

;

515

516

rule2: <<Upper>>? ID;

517

rule3: <<Lower>>? ID;

518

519

The definition of #pred Alpha expresses:

520

521

a. to test the predicate use the C code "isAlpha(LATEXT(1))"

522

523

b. to analyze the predicate use the information that

524

Alpha is equivalent to the union of Upper and Lower.

525

526

During ambiguity resolution the definition of Alpha is expanded

527

into "Upper || Lower" and compared with the predicate in the other

528

alternative, which is also "Upper || Lower". Because they are

529

identical MR11 will report a problem.

530

531

-------------------------------------------------------------------------

532

t10.g, line 5: warning: the predicates used to disambiguate rule rule0

533

(file t10.g alt 1 line 5 and alt 2 line 6)

534

are identical when compared without context and may have no

535

resolving power for some lookahead sequences.

536

-------------------------------------------------------------------------

537

538

If you use the "-info p" option the output file will contain:

539

540

+----------------------------------------------------------------------+

541

|#if 0 |

542

| |

543

|The following predicates are identical when compared without |

544

| lookahead context information. For some ambiguous lookahead |

545

| sequences they may not have any power to resolve the ambiguity. |

546

| |

547

|Choice 1: rule0/1 alt 1 line 5 file t10.g |

548

| |

549

| The original predicate for choice 1 with available context |

550

| information: |

551

| |

552

| OR expr |

553

| |

554

| pred << Upper>>? |

555

| depth=k=1 rule rule2 line 14 t10.g |

556

| set context: |

557

| ID |

558

| |

559

| pred << Lower>>? |

560

| depth=k=1 rule rule3 line 15 t10.g |

561

| set context: |

562

| ID |

563

| |

564

| The predicate for choice 1 after expansion (but without context |

565

| information): |

566

| |

567

| OR expr |

568

| |

569

| pred << isUpper(LATEXT(1))>>? |

570

| depth=k=1 rule line 1 t10.g |

571

| |

572

| pred << isLower(LATEXT(1))>>? |

573

| depth=k=1 rule line 2 t10.g |

574

| |

575

| |

576

|Choice 2: rule0/2 alt 2 line 6 file t10.g |

577

| |

578

| The original predicate for choice 2 with available context |

579

| information: |

580

| |

581

| pred << Alpha>>? |

582

| depth=k=1 rule rule0 line 6 t10.g |

583

| set context: |

584

| ID |

585

| |

586

| The predicate for choice 2 after expansion (but without context |

587

| information): |

588

| |

589

| OR expr |

590

| |

591

| pred << isUpper(LATEXT(1))>>? |

592

| depth=k=1 rule line 1 t10.g |

593

| |

594

| pred << isLower(LATEXT(1))>>? |

595

| depth=k=1 rule line 2 t10.g |

596

| |

597

| |

598

|#endif |

599

+----------------------------------------------------------------------+

600

601

The comparison of the predicates for the two alternatives takes

602

place without context information, which means that in some cases

603

the predicates will be considered identical even though they operate

604

on disjoint lookahead sets. Consider:

605

606

#pred Alpha

607

608

rule1: <<Alpha>>? ID

609

| <<Alpha>>? Label

610

;

611

612

Because the comparison of predicates takes place without context

613

these will be considered identical. The reason for comparing

614

without context is that otherwise it would be necessary to re-evaluate

615

the entire predicate expression for each possible lookahead sequence.

616

This would require more code to be written and more CPU time during

617

grammar analysis, and it is not yet clear whether anyone will even make

618

use of the new #pred facility.

619

620

A temporary workaround might be to use different #pred statements

621

for predicates you know have different context. This would avoid

622

extraneous warnings.

623

624

The above example might be termed a "false positive". Comparison

625

without context will also lead to "false negatives". Consider the

626

following example:

627

628

#pred Alpha

629

#pred Beta

630

631

rule1: <<Alpha>>? A

632

| rule2

633

;

634

635

rule2: <<Alpha>>? A

636

| <<Beta>>? B

637

;

638

639

The predicate used for alt 2 of rule1 is (Alpha || Beta). This

640

appears to be different than the predicate Alpha used for alt1.

641

However, the context of Beta is B. Thus when the lookahead is A

642

Beta will have no resolving power and Alpha will be used for both

643

alternatives. Using the same predicate for both alternatives isn't

644

very helpful, but this will not be detected with 1.33MR11.

645

646

To properly handle this the predicate expression would have to be

647

evaluated for each distinct lookahead context.

648

649

To determine whether two predicate expressions are identical is

650

difficult. The routine may fail to identify identical predicates.

651

652

The #pred feature also compares predicates to see if a choice between

653

alternatives is resolved by a predicate which makes the second

654

choice unreachable. Consider the following example:

655

656

#pred A <<A(LATEXT(1))>>?

657

#pred B <<B(LATEXT(1))>>?

658

#pred A_or_B A || B

659

660

r : s

661

| t

662

;

663

s : <<A_or_B>>? ID

664

;

665

t : <<A>>? ID

666

;

667

668

----------------------------------------------------------------------------

669

t11.g, line 5: warning: the predicate used to disambiguate the

670

first choice of rule r

671

(file t11.g alt 1 line 5 and alt 2 line 6)

672

appears to "cover" the second predicate when compared without context.

673

The second predicate may have no resolving power for some lookahead

674

sequences.

675

----------------------------------------------------------------------------

676

677

#139. (Changed in MR11) Problem with -gp in C++ mode

678

679

The -gp option to add a prefix to rule names did not work in

680

C++ mode. This has been fixed.

681

682

Reported by Alexey Demakov (demakov@kazbek.ispras.ru).

683

684

#138. (Changed in MR11) Additional makefiles for non-MSVC++ MS systems

685

686

Sramji Ramanathan (ps@kumaran.com) has supplied makefiles for

687

building antlr and dlg with Win95/NT development tools that

688

are not based on MSVC5. They are pccts/antlr/AntlrMS.mak and

689

pccts/dlg/DlgMS.mak.

690

691

The first line of the makefiles requires a definition of PCCTS_HOME.

692

693

These are in addition to the AntlrMSVC50.* and DlgMSVC50.*

694

supplied by Jeff Vincent (JVincent@novell.com).

695

696

#137. (Changed in MR11) Token getType(), getText(), getLine() const members

697

698

--------------------------------------------------------------------

699

If you use ANTLRCommonToken this change probably does not affect you.

700

--------------------------------------------------------------------

701

702

For a long time it has bothered me that these accessor functions

703

in ANTLRAbstractToken were not const member functions. I have

704

refrained from changing them because it would require users to modify

705

existing token class definitions which are derived directly

706

from ANTLRAbstractToken. I think it is now time.

707

708

For those who are not used to C++, a "const member function" is a

709

member function which does not modify its own object - the thing

710

to which "this" points. This is quite different from a function

711

which does not modify its arguments.

712

713

Most token definitions based on ANTLRAbstractToken have something like

714

the following in order to create concrete definitions of the pure

715

virtual methods in ANTLRAbstractToken:

716

717

class MyToken : public ANTLRAbstractToken {

718

...

719

ANTLRTokenType getType() {return _type; }

720

int getLine() {return _line; }

721

ANTLRChar * getText() {return _text; }

722

...

723

}

724

725

The required change is simply to put "const" following the function

726

prototype in the header (.h file) and the definition file (.cpp if

727

it is not inline):

728

729

class MyToken : public ANTLRAbstractToken {

730

...

731

ANTLRTokenType getType() const {return _type; }

732

int getLine() const {return _line; }

733

ANTLRChar * getText() const {return _text; }

734

...

735

}

736

737

This was originally proposed a long time ago by Bruce

738

Guenter (bruceg@qcc.sk.ca).

739

740

#136. (Changed in MR11) Added getLength() to ANTLRCommonToken

741

742

Classes ANTLRCommonToken and ANTLRCommonTokenNoRefCountToken

743

now have a member function:

744

745

int getLength() const { return strlen(getText()); }

746

747

Suggested by Sramji Ramanathan (ps@kumaran.com).

748

749

#135. (Changed in MR11) Raised antlr's own default ZZLEXBUFSIZE to 8k

750

751

#134a. (ansi_mr10.zip) T.J. Parr's ANSI C grammar made 1.33MR11 compatible

752

753

There is a typographical error in the definition of BITWISEOREQ:

754

755

#token BITWISEOREQ "!=" should be "\|="

756

757

When this change is combined with the bugfix to the follow set cache

758

problem (Item #147) and a minor rearrangement of the grammar

759

(Item #134b) it becomes a k=1 ck=2 grammar.

760

761

#134b. (ansi_mr10.zip) T.J. Parr's ANSI C grammar made 1.33MR11 compatible

762

763

The following changes were made in the ansi.g grammar (along with

764

using -mrhoist on):

765

766

ansi.g

767

======

768

void tracein(char *) ====> void tracein(const char *)

769

void traceout(char *) ====> void traceout(const char *)

770

771

<<LT(1)->getType()==IDENTIFIER ? isTypeName(LT(1)->getText()) : 1>>?

772

====> <<isTypeName(LT(1)->getText())>>?

773

774

<<(LT(1)->getType()==LPARENTHESIS && LT(2)->getType()==IDENTIFIER) ? \

775

isTypeName(LT(2)->getText()) : 1>>?

776

====> (LPARENTHESIS IDENTIFIER)? => <<isTypeName(LT(2)->getText())>>?

777

778

<<(LT(1)->getType()==LPARENTHESIS && LT(2)->getType()==IDENTIFIER) ? \

779

isTypeName(LT(2)->getText()) : 1>>?

780

====> (LPARENTHESIS IDENTIFIER)? => <<isTypeName(LT(2)->getText())>>?

781

782

added to init(): traceOptionValueDefault=0;

783

added to init(): traceOption(-1);

784

785

change rule "statement":

786

787

statement

788

: plain_label_statement

789

| case_label_statement

790

| <<;>> expression SEMICOLON

791

| compound_statement

792

| selection_statement

793

| iteration_statement

794

| jump_statement

795

| SEMICOLON

796

;

797

798

plain_label_statement

799

: IDENTIFIER COLON statement

800

;

801

802

case_label_statement

803

: CASE constant_expression COLON statement

804

| DEFAULT COLON statement

805

;

806

807

support.cpp

808

===========

809

void tracein(char *) ====> void tracein(const char *)

810

void traceout(char *) ====> void traceout(const char *)

811

812

added to tracein(): ANTLRParser::tracein(r); // call superclass method

813

added to traceout(): ANTLRParser::traceout(r); // call superclass method

814

815

Makefile

816

========

817

added to AFLAGS: -mrhoist on -prc on

818

819

#133. (Changed in 1.33MR11) Make trace options public in ANTLRParser

820

821

In checking T.J. Parr's ANSI C grammar for compatibility with

822

1.33MR11 I discovered that it was inconvenient to have the

823

trace facilities with protected access.

824

825

#132. (Changed in 1.33MR11) Recognition of identical predicates in alts

826

827

Prior to 1.33MR11, there would be no ambiguity warning when the

828

very same predicate was used to disambiguate both alternatives:

829

830

test: ref B

831

| ref C

832

;

833

834

ref : <<pred(LATEXT(1))>>? A

835

836

In 1.33MR11 this will cause the warning:

837

838

warning: the predicates used to disambiguate rule test

839

(file v98.g alt 1 line 1 and alt 2 line 2)

840

are identical and have no resolving power

841

842

----------------- Note -----------------

843

844

This is different than the following case

845

846

test: <<pred(LATEXT(1))>>? A B

847

| <<pred(LATEXT(1))>>? A C

848

;

849

850

In this case there are two distinct predicates

851

which have exactly the same text. In the first

852

example there are two references to the same

853

predicate. The problem represented by this

854

grammar will be addressed later.

855

856

#131. (Changed in 1.33MR11) Case insensitive command line options

857

858

Command line switches like "-CC" and keywords like "on", "off",

859

and "stdin" are no longer case sensitive in antlr, dlg, and sorcerer.

860

861

#130. (Changed in 1.33MR11) Changed ANTLR_VERSION to int from string

862

863

The ANTLR_VERSION was not an integer, making it difficult to

864

perform conditional compilation based on the antlr version.

865

866

Henceforth, ANTLR_VERSION will be:

867

868

(base_version * 100) + release number

869

870

thus 1.33MR11 will be: 133*100+11 = 13311

871

872

Suggested by Rainer Janssen (Rainer.Janssen@Informatik.Uni-Oldenburg.DE).

873

874

#129. (Changed in 1.33MR11) Addition of ANTLR_VERSION to <parserName>.h

875

876

The following code is now inserted into <parserName>.h and

877

stdpccts.h:

878

879

#ifndef ANTLR_VERSION

880

#define ANTLR_VERSION 13311

881

#endif

882

883

Suggested by Rainer Janssen (Rainer.Janssen@Informatik.Uni-Oldenburg.DE)

884

885

#128. (Changed in 1.33MR11) Redundant predicate code in (<<pred>>? ...)+

886

887

Prior to 1.33MR11, the following grammar would generate

888

redundant tests for the "while" condition.

889

890

rule2 : (<<pred>>? X)+ X

891

| B

892

;

893

894

The code would resemble:

895

896

if (LA(1)==X) {

897

if (pred) {

898

do {

899

if (!pred) {zzfailed_pred(" pred");}

900

zzmatch(X); zzCONSUME;

901

} while (LA(1)==X && pred && pred);

902

} else {...

903

904

With 1.33MR11 the redundant predicate test is omitted.

905

906

#127. (Changed in 1.33MR11)

907

908

Count Syntax Errors Count DLG Errors

909

------------------- ----------------

910

911

C++ mode ANTLRParser:: DLGLexerBase::

912

syntaxErrCount lexErrCount

913

C mode zzSyntaxErrCount zzLexErrCount

914

915

The C mode variables are global and initialized to 0.

916

They are *not* reset to 0 automatically when antlr is

917

restarted.

918

919

The C++ mode variables are public. They are initialized

920

to 0 by the constructors. They are *not* reset to 0 by the

921

ANTLRParser::init() method.

922

923

Suggested by Reinier van den Born (reinier@vnet.ibm.com).

924

925

#126. (Changed in 1.33MR11) Addition of #first <<...>>

926

927

The #first <<...>> inserts the specified text in the output

928

files before any other #include statements required by pccts.

929

The only things before the #first text are comments and

930

a #define ANTLR_VERSION.

931

932

Requested by Esa Pulkkinen (esap@cs.tut.fi) and Alexin

933

Zoltan (alexin@inf.u-szeged.hu).

934

935

#125. (Changed in 1.33MR11) Lookahead for (guard)? && <<p>>? predicates

936

937

When implementing the new style of guard predicate (Item #113)

938

in 1.33MR10 I decided to temporarily ignore the problem of

939

computing the "narrowest" lookahead context.

940

941

Consider the following k=1 grammar:

942

943

start : a

944

| b

945

;

946

947

a : (A)? && <<pred1(LATEXT(1))>>? ab ;

948

b : (B)? && <<pred2(LATEXT(1))>>? ab ;

949

950

ab : A | B ;

951

952

In MR10 the context for both "a" and "b" was {A B} because this is

953

the first set of rule "ab". Normally, this is not a problem because

954

the predicate which follows the guard inhibits any ambiguity report

955

by antlr.

956

957

In MR11 the first set for rule "a" is {A} and for rule "b" it is {B}.

958

959

#124. A Note on the New "&&" Style Guarded Predicates

960

961

I've been asked several times, "What is the difference between

962

the old "=>" style guard predicates and the new style "&&" guard

963

predicates, and how do you choose one over the other" ?

964

965

The main difference is that the "=>" does not apply the

966

predicate if the context guard doesn't match, whereas

967

the && form always does. What is the significance ?

968

969

If you have a predicate which is not on the "leading edge"

970

it cannot be hoisted. Suppose you need a predicate that

971

looks at LA(2). You must introduce it manually. The

972

classic example is:

973

974

castExpr :

975

LP typeName RP

976

| ....

977

;

978

979

typeName : <<isTypeName(LATEXT(1))>>? ID

980

| STRUCT ID

981

;

982

983

The problem is that isTypeName() isn't on the leading edge

984

of typeName, so it won't be hoisted into castExpr to help

985

make a decision on which production to choose.

986

987

The *first* attempt to fix it is this:

988

989

castExpr :

990

<<isTypeName(LATEXT(2))>>?

991

LP typeName RP

992

| ....

993

;

994

995

Unfortunately, this won't work because it ignores

996

the problem of STRUCT. The solution is to apply

997

isTypeName() in castExpr if LA(2) is an ID and

998

don't apply it when LA(2) is STRUCT:

999

1000

castExpr :

1001

(LP ID)? => <<isTypeName(LATEXT(2))>>?

1002

LP typeName RP

1003

| ....

1004

;

1005

1006

In conclusion, the "=>" style guarded predicate is

1007

useful when:

1008

1009

a. the tokens required for the predicate

1010

are not on the leading edge

1011

b. there are alternatives in the expression

1012

selected by the predicate for which the

1013

predicate is inappropriate

1014

1015

If (b) were false, then one could use a simple

1016

predicate (assuming "-prc on"):

1017

1018

castExpr :

1019

<<isTypeName(LATEXT(2))>>?

1020

LP typeName RP

1021

| ....

1022

;

1023

1024

typeName : <<isTypeName(LATEXT(1))>>? ID

1025

;

1026

1027

So, when do you use the "&&" style guarded predicate ?

1028

1029

The new-style "&&" predicate should always be used with

1030

predicate context. The context guard is in ADDITION to

1031

the automatically computed context. Thus it is useful for

1032

predicates which depend on the token type for reasons

1033

other than context.

1034

1035

The following example is contributed by Reinier van den Born

1036

(reinier@vnet.ibm.com).

1037

1038

+-------------------------------------------------------------------------+

1039

| This grammar has two ways to call functions: |

1040

| |

1041

| - a "standard" call syntax with parens and comma separated args |

1042

| - a shell command like syntax (no parens and spacing separated args) |

1043

| |

1044

| The former also allows a variable to hold the name of the function, |

1045

| the latter can also be used to call external commands. |

1046

| |

1047

| The grammar (simplified) looks like this: |

1048

| |

1049

| fun_call : ID "(" { expr ("," expr)* } ")" |

1050

| /* ID is function name */ |

1051

| | "@" ID "(" { expr ("," expr)* } ")" |

1052

| /* ID is var containing fun name */ |

1053

| ; |

1054

| |

1055

| command : ID expr* /* ID is function name */ |

1056

| | path expr* /* path is external command name */ |

1057

| ; |

1058

| |

1059

| path : ID /* left out slashes and such */ |

1060

| | "@" ID /* ID is environment var */ |

1061

| ; |

1062

| |

1063

| expr : .... |

1064

| | "(" expr ")"; |

1065

| |

1066

| call : fun_call |

1067

| | command |

1068

| ; |

1069

| |

1070

| Obviously the call is wildly ambiguous. This is more or less how this |

1071

| is to be resolved: |

1072

| |

1073

| A call begins with an ID or an @ followed by an ID. |

1074

| |

1075

| If it is an ID and if it is an ext. command name -> command |

1076

| if followed by a paren -> fun_call |

1077

| otherwise -> command |

1078

| |

1079

| If it is an @ and if the ID is a var name -> fun_call |

1080

| otherwise -> command |

1081

| |

1082

| One can implement these rules quite neatly using && predicates: |

1083

| |

1084

| call : ("@" ID)? && <<isVarName(LT(2))>>? fun_call |

1085

| | (ID)? && <<isExtCmdName>>? command |

1086

| | (ID "(")? fun_call |

1087

| | command |

1088

| ; |

1089

| |

1090

| This can be done better, so it is not an ideal example, but it |

1091

| conveys the principle. |

1092

+-------------------------------------------------------------------------+

1093

1094

#123. (Changed in 1.33MR11) Correct definition of operators in ATokPtr.h

1095

1096

The return value of operators in ANTLRTokenPtr:

1097

1098

changed: unsigned ... operator !=(...)

1099

to: int ... operator != (...)

1100

changed: unsigned ... operator ==(...)

1101

to: int ... operator == (...)

1102

1103

Suggested by R.A. Nelson (cowboy@VNET.IBM.COM)

1104

1105

#122. (Changed in 1.33MR11) Member functions to reset DLG in C++ mode

1106

1107

void DLGFileReset(FILE *f) { input = f; found_eof = 0; }

1108

void DLGStringReset(DLGChar *s) { input = s; p = &input[0]; }

1109

1110

Supplied by R.A. Nelson (cowboy@VNET.IBM.COM)

1111

1112

#121. (Changed in 1.33MR11) Another attempt to fix -o (output dir) option

1113

1114

Another attempt is made to improve the -o option of antlr, dlg,

1115

and sorcerer. This one by JVincent (JVincent@novell.com).

1116

1117

The current rule:

1118

1119

a. If -o is not specified then any explicit directory

1120

names are retained.

1121

1122

b. If -o is specified then the -o directory name overrides any

1123

explicit directory names.

1124

1125

c. The directory name of the grammar file is *not* stripped

1126

to create the main output file. However it is still subject

1127

to override by the -o directory name.

1128

1129

#120. (Changed in 1.33MR11) "-info f" output to stdout rather than stderr

1130

1131

Added option 0 (e.g. "-info 0") which is a noop.

1132

1133

#119. (Changed in 1.33MR11) Ambiguity aid for grammars

1134

1135

The user can ask for additional information on ambiguities reported

1136

by antlr to stdout. At the moment, only one ambiguity report can

1137

be created in an antlr run.

1138

1139

This feature is enabled using the "-aa" (Ambiguity Aid) option.

1140

1141

The following options control the reporting of ambiguities:

1142

1143

-aa ruleName Selects reporting by name of rule

1144

-aa lineNumber Selects reporting by line number

1145

(file name not compared)

1146

1147

-aam Selects "multiple" reporting for a token

1148

in the intersection set of the

1149

alternatives.

1150

1151

For instance, the token ID may appear dozens

1152

of times in various paths as the program

1153

explores the rules which are reachable from

1154

the point of an ambiguity. With option -aam

1155

every possible path the search program

1156

encounters is reported.

1157

1158

Without -aam only the first encounter is

1159

reported. This may result in incomplete

1160

information, but the information may be

1161

sufficient and much shorter.

1162

1163

-aad depth Selects the depth of the search.

1164

The default value is 1.

1165

1166

The number of paths to be searched, and the

1167

size of the report can grow geometrically

1168

with the -ck value if a full search for all

1169

contributions to the source of the ambiguity

1170

is explored.

1171

1172

The depth represents the number of tokens

1173

in the lookahead set which are matched against

1174

the set of ambiguous tokens. A depth of 1

1175

means that the search stops when a lookahead

1176

sequence of just one token is matched.

1177

1178

A k=1 ck=6 grammar might generate 5,000 items

1179

in a report if a full depth 6 search is made

1180

with the Ambiguity Aid. The source of the

1181

problem may be in the first token and obscured

1182

by the volume of data - I hesitate to call

1183

it information.

1184

1185

When the user selects a depth > 1, the search

1186

is first performed at depth=1 for both

1187

alternatives, then depth=2 for both alternatives,

1188

etc.

1189

1190

Sample output for rule grammar in antlr.g itself:

1191

1192

+---------------------------------------------------------------------+

1193

| Ambiguity Aid |

1194

| |

1195

| Choice 1: grammar/70 line 632 file a.g |

1196

| Choice 2: grammar/82 line 644 file a.g |

1197

| |

1198

| Intersection of lookahead[1] sets: |

1199

| |

1200

| "\}" "class" "#errclass" "#tokclass" |

1201

| |

1202

| Choice:1 Depth:1 Group:1 ("#errclass") |

1203

| 1 in (...)* block grammar/70 line 632 a.g |

1204

| 2 to error grammar/73 line 635 a.g |

1205

| 3 error error/1 line 894 a.g |

1206

| 4 #token "#errclass" error/2 line 895 a.g |

1207

| |

1208

| Choice:1 Depth:1 Group:2 ("#tokclass") |

1209

| 2 to tclass grammar/74 line 636 a.g |

1210

| 3 tclass tclass/1 line 937 a.g |

1211

| 4 #token "#tokclass" tclass/2 line 938 a.g |

1212

| |

1213

| Choice:1 Depth:1 Group:3 ("class") |

1214

| 2 to class_def grammar/75 line 637 a.g |

1215

| 3 class_def class_def/1 line 669 a.g |

1216

| 4 #token "class" class_def/3 line 671 a.g |

1217

| |

1218

| Choice:1 Depth:1 Group:4 ("\}") |

1219

| 2 #token "\}" grammar/76 line 638 a.g |

1220

| |

1221

| Choice:2 Depth:1 Group:5 ("#errclass") |

1222

| 1 in (...)* block grammar/83 line 645 a.g |

1223

| 2 to error grammar/93 line 655 a.g |

1224

| 3 error error/1 line 894 a.g |

1225

| 4 #token "#errclass" error/2 line 895 a.g |

1226

| |

1227

| Choice:2 Depth:1 Group:6 ("#tokclass") |

1228

| 2 to tclass grammar/94 line 656 a.g |

1229

| 3 tclass tclass/1 line 937 a.g |

1230

| 4 #token "#tokclass" tclass/2 line 938 a.g |

1231

| |

1232

| Choice:2 Depth:1 Group:7 ("class") |

1233

| 2 to class_def grammar/95 line 657 a.g |

1234

| 3 class_def class_def/1 line 669 a.g |

1235

| 4 #token "class" class_def/3 line 671 a.g |

1236

| |

1237

| Choice:2 Depth:1 Group:8 ("\}") |

1238

| 2 #token "\}" grammar/96 line 658 a.g |

1239

+---------------------------------------------------------------------+

1240

1241

For a linear lookahead set ambiguity (where k=1 or for k>1 but

1242

when all lookahead sets [i] with i<k all have degree one) the

1243

reports appear in the following order:

1244

1245

for (depth=1 ; depth <= "-aad depth" ; depth++) {

1246

for (alternative=1; alternative <=2 ; alternative++) {

1247

while (matches-are-found) {

1248

group++;

1249

print-report

1250

};

1251

};

1252

};

1253

1254

For reporting a k-tuple ambiguity, the reports appear in the

1255

following order:

1256

1257

for (depth=1 ; depth <= "-aad depth" ; depth++) {

1258

while (matches-are-found) {

1259

for (alternative=1; alternative <=2 ; alternative++) {

1260

group++;

1261

print-report

1262

};

1263

};

1264

};

1265

1266

This is because matches are generated in different ways for

1267

linear lookahead and k-tuples.

1268

1269

#118. (Changed in 1.33MR11) DEC VMS makefile and VMS related changes

1270

1271

Revised makefiles for DEC/VMS operating system for antlr, dlg,

1272

and sorcerer.

1273

1274

Reduced names of routines with external linkage to less than 32

1275

characters to conform to DEC/VMS linker limitations.

1276

1277

Jean-Francois Pieronne discovered problems with dlg and antlr

1278

due to the VMS linker not being case sensitive for names with

1279

external linkage. In dlg the problem was with "className" and

1280

"ClassName". In antlr the problem was with "GenExprSets" and

1281

"genExprSets".

1282

1283

Added genmms, a version of genmk for the DEC/VMS version of make.

1284

The source is in directory pccts/support/DECmms.

1285

1286

All VMS contributions by Jean-Francois Pieronne (jfp@iname.com).

1287

1288

#117. (Changed in 1.33MR10) new EXPERIMENTAL predicate hoisting code

1289

1290

The hoisting of predicates into rules to create prediction

1291

expressions is a problem in antlr. Consider the following

1292

example (k=1 with -prc on):

1293

1294

start : (a)* "@" ;

1295

a : b | c ;

1296

b : <<isUpper(LATEXT(1))>>? A ;

1297

c : A ;

1298

1299

Prior to 1.33MR10 the code generated for "start" would resemble:

1300

1301

while {

1302

if (LA(1)==A &&

1303

(!LA(1)==A || isUpper())) {

1304

a();

1305

}

1306

};

1307

1308

This code is wrong because it makes rule "c" unreachable from

1309

"start". The essence of the problem is that antlr fails to

1310

recognize that there can be a valid alternative within "a" even

1311

when the predicate <<isUpper(LATEXT(1))>>? is false.

1312

1313

In 1.33MR10 with -mrhoist the hoisting of the predicate into

1314

"start" is suppressed because it recognizes that "c" can

1315

cover all the cases where the predicate is false:

1316

1317

while {

1318

if (LA(1)==A) {

1319

a();

1320

}

1321

};

1322

1323

With the antlr "-info p" switch the user will receive information

1324

about the predicate suppression in the generated file:

1325

1326

--------------------------------------------------------------

1327

#if 0

1328

1329

Hoisting of predicate suppressed by alternative without predicate.

1330

The alt without the predicate includes all cases where

1331

the predicate is false.

1332

1333

WITH predicate: line 7 v1.g

1334

WITHOUT predicate: line 7 v1.g

1335

1336

The context set for the predicate:

1337

1338

1339

1340

The lookahead set for the alt WITHOUT the semantic predicate:

1341

1342

1343

1344

The predicate:

1345

1346

pred << isUpper(LATEXT(1))>>?

1347

depth=k=1 rule b line 9 v1.g

1348

set context:

1349

1350

tree context: null

1351

1352

Chain of referenced rules:

1353

1354

#0 in rule start (line 5 v1.g) to rule a

1355

#1 in rule a (line 7 v1.g)

1356

1357

#endif

1358

--------------------------------------------------------------

1359

1360

A predicate can be suppressed by a combination of alternatives

1361

which, taken together, cover a predicate:

1362

1363

start : (a)* "@" ;

1364

1365

a : b | ca | cb | cc ;

1366

1367

b : <<isUpper(LATEXT(1))>>? ( A | B | C ) ;

1368

1369

ca : A ;

1370

cb : B ;

1371

cc : C ;

1372

1373

Consider a more complex example in which "c" covers only part of

1374

a predicate:

1375

1376

start : (a)* "@" ;

1377

1378

a : b

1379

| c

1380

;

1381

1382

b : <<isUpper(LATEXT(1))>>?

1383

( A

1384

| X

1385

);

1386

1387

c : A

1388

;

1389

1390

Prior to 1.33MR10 the code generated for "start" would resemble:

1391

1392

while {

1393

if ( (LA(1)==A || LA(1)==X) &&

1394

(! (LA(1)==A || LA(1)==X) || isUpper()) {

1395

a();

1396

}

1397

};

1398

1399

With 1.33MR10 and -mrhoist the predicate context is restricted to

1400

the non-covered lookahead. The code resembles:

1401

1402

while {

1403

if ( (LA(1)==A || LA(1)==X) &&

1404

(! (LA(1)==X) || isUpper()) {

1405

a();

1406

}

1407

};

1408

1409

With the antlr "-info p" switch the user will receive information

1410

about the predicate restriction in the generated file:

1411

1412

--------------------------------------------------------------

1413

#if 0

1414

1415

Restricting the context of a predicate because of overlap

1416

in the lookahead set between the alternative with the

1417

semantic predicate and one without

1418

Without this restriction the alternative without the predicate

1419

could not be reached when input matched the context of the

1420

predicate and the predicate was false.

1421

1422

WITH predicate: line 11 v4.g

1423

WITHOUT predicate: line 12 v4.g

1424

1425

The original context set for the predicate:

1426

1427

A X

1428

1429

The lookahead set for the alt WITHOUT the semantic predicate:

1430

1431

1432

1433

The intersection of the two sets

1434

1435

1436

1437

The original predicate:

1438

1439

pred << isUpper(LATEXT(1))>>?

1440

depth=k=1 rule b line 15 v4.g

1441

set context:

1442

A X

1443

tree context: null

1444

1445

The new (modified) form of the predicate:

1446

1447

pred << isUpper(LATEXT(1))>>?

1448

depth=k=1 rule b line 15 v4.g

1449

set context:

1450

1451

tree context: null

1452

1453

#endif

1454

--------------------------------------------------------------

1455

1456

The bad news about -mrhoist:

1457

1458

(a) -mrhoist does not analyze predicates with lookahead

1459

depth > 1.

1460

1461

(b) -mrhoist does not look past a guarded predicate to

1462

find context which might cover other predicates.

1463

1464

For these cases you might want to use syntactic predicates.

1465

When a semantic predicate fails during guess mode the guess

1466

fails and the next alternative is tried.

1467

1468

Limitation (a) is illustrated by the following example:

1469

1470

start : (stmt)* EOF ;

1471

1472

stmt : cast

1473

| expr

1474

;

1475

cast : <<isTypename(LATEXT(2))>>? LP ID RP ;

1476

1477

expr : LP ID RP ;

1478

1479

This is not much different from the first example, except that

1480

it requires two tokens of lookahead context to determine what

1481

to do. This predicate is NOT suppressed because the current version

1482

is unable to handle predicates with depth > 1.

1483

1484

A predicate can be combined with other predicates during hoisting.

1485

In those cases the depth=1 predicates are still handled. Thus,

1486

in the following example the isUpper() predicate will be suppressed

1487

by line #4 when hoisted from "bizarre" into "start", but will still

1488

be present in "bizarre" in order to predict "stmt".

1489

1490

start : (bizarre)* EOF ; // #1

1491

// #2

1492

bizarre : stmt // #3

1493

| A // #4

1494

;

1495

1496

stmt : cast

1497

| expr

1498

;

1499

1500

cast : <<isTypename(LATEXT(2))>>? LP ID RP ;

1501

1502

expr : LP ID RP

1503

| <<isUpper(LATEXT(1))>>? A ;

1504

1505

Limitation (b) is illustrated by the following example of a

1506

context guarded predicate:

1507

1508

rule : (A)? <<p>>? // #1

1509

(A // #2

1510

|B // #3

1511

) // #4

1512

| <<q>>? B // #5

1513

;

1514

1515

Recall that this means that when the lookahead is NOT A then

1516

the predicate "p" is ignored and it attempts to match "A|B".

1517

Ideally, the "B" at line #3 should suppress predicate "q".

1518

However, the current version does not attempt to look past

1519

the guard predicate to find context which might suppress other

1520

predicates.

1521

1522

In some cases -mrhoist will lead to the reporting of ambiguities

1523

which were not visible before:

1524

1525

start : (a)* "@";

1526

a : bc | d;

1527

bc : b | c ;

1528

1529

b : <<isUpper(LATEXT(1))>>? A;

1530

c : A ;

1531

1532

d : A ;

1533

1534

In this case there is a true ambiguity in "a" between "bc" and "d"

1535

which can both match "A". Without -mrhoist the predicate in "b"

1536

is hoisted into "a" and there is no ambiguity reported. However,

1537

with -mrhoist, the predicate in "b" is suppressed by "c" (as it

1538

should be) making the ambiguity in "a" apparent.

1539

1540

The motivations for these changes were hoisting problems reported

1541

by Reinier van den Born (reinier@vnet.ibm.com) and several others.

1542

1543

#116. (Changed in 1.33MR10) C++ mode: tracein/traceout rule name is (const char *)

1544

1545

The prototype for C++ mode routine tracein (and traceout) has changed from

1546

"char *" to "const char *".

1547

1548

#115. (Changed in 1.33MR10) Using guess mode with exception handlers in C mode

1549

1550

The definition of the C mode macros zzmatch_wsig and zzsetmatch_wsig

1551

neglected to consider guess mode. When control passed to the rule's

1552

parse exception handler the routine would exit without ever closing the

1553

guess block. This would lead to unpredictable behavior.

1554

1555

In 1.33MR10 the behavior of exceptions in C mode and C++ mode should be

1556

identical.

1557

1558

#114. (Changed in 1.33MR10) difference in [zz]resynch() between C and C++ modes

1559

1560

There was a slight difference in the way C and C++ mode resynchronized

1561

following a parsing error. The C routine would sometimes skip an extra

1562

token before attempting to resynchronize.

1563

1564

The C routine was changed to match the C++ routine.

1565

1566

#113. (Changed in 1.33MR10) new context guarded pred: (g)? && <<p>>? expr

1567

1568

The existing context guarded predicate:

1569

1570

rule : (guard)? => <<p>>? expr

1571

| next_alternative

1572

;

1573

1574

generates code which resembles:

1575

1576

if (lookahead(expr) && (!guard || pred)) {

1577

expr()

1578

} else ....

1579

1580

This is not suitable for some applications because it allows

1581

expr() to be invoked when the predicate is false. This is

1582

intentional because it is meant to mimic automatically computed

1583

predicate context.

1584

1585

The new context guarded predicate uses the guard information

1586

differently because it has a different goal. Consider:

1587

1588

rule : (guard)? && <<p>>? expr

1589

| next_alternative

1590

;

1591

1592

The new style of context guarded predicate is equivalent to:

1593

1594

rule : <<guard==true && pred>>? expr

1595

| next_alternative

1596

;

1597

1598

It generates code which resembles:

1599

1600

if (lookahead(expr) && guard && pred) {

1601

expr();

1602

} else ...

1603

1604

Both forms of guarded predicates severely restrict the form of

1605

the context guard: it can contain no rule references, no

1606

(...)*, no (...)+, and no {...}. It may contain token and

1607

token class references, and alternation ("|").

1608

1609

Addition for 1.33MR11: in the token expression all tokens must

1610

be at the same height of the token tree:

1611

1612

(A ( B | C))? && ... is ok (all height 2)

1613

(A ( B | ))? && ... is not ok (some 1, some 2)

1614

(A B C D | E F G H)? && ... is ok (all height 4)

1615

(A B C D | E )? && ... is not ok (some 4, some 1)

1616

1617

This restriction is required in order to properly compute the lookahead

1618

set for expressions like:

1619

1620

rule1 : (A B C)? && <<pred>>? rule2 ;

1621

rule2 : (A|X) (B|Y) (C|Z);

1622

1623

This addition was suggested by Reinier van den Born (reinier@vnet.ibm.com)

1624

1625

#112. (Changed in 1.33MR10) failed validation predicate in C guess mode

1626

1627

John Lilley (jlilley@empathy.com) suggested that failed validation

1628

predicates abort a guess rather than reporting a failed error.

1629

This was installed in C++ mode (Item #4). Only now was it noticed

1630

that the fix was never installed for C mode.

1631

1632

#111. (Changed in 1.33MR10) moved zzTRACEIN to before init action

1633

1634

When the antlr -gd switch is present antlr generates calls to

1635

zzTRACEIN at the start of a rule and zzTRACEOUT at the exit

1636

from a rule. Prior to 1.33MR10 the call to zzTRACEIN was

1637

after the init-action, which could cause confusion because the

1638

init-actions were reported with the name of the enclosing rule,

1639

rather than the active rule.

1640

1641

#110. (Changed in 1.33MR10) antlr command line copied to generated file

1642

1643

The antlr command line is now copied to the generated file near

1644

the start.

1645

1646

#109. (Changed in 1.33MR10) improved trace information

1647

1648

The quality of the trace information provided by the "-gd"

1649

switch has been improved significantly. Here is an example

1650

of the output from a test program. It shows the rule name,

1651

the first token of lookahead, the call depth, and the guess

1652

status:

1653

1654

exit rule gusxx {"?"} depth 2

1655

enter rule gusxx {"?"} depth 2

1656

enter rule gus1 {"o"} depth 3 guessing

1657

guess done - returning to rule gus1 {"o"} at depth 3

1658

(guess mode continues - an enclosing guess is still active)

1659

guess done - returning to rule gus1 {"Z"} at depth 3

1660

(guess mode continues - an enclosing guess is still active)

1661

exit rule gus1 {"Z"} depth 3 guessing

1662

guess done - returning to rule gusxx {"o"} at depth 2 (guess mode ends)

1663

enter rule gus1 {"o"} depth 3

1664

guess done - returning to rule gus1 {"o"} at depth 3 (guess mode ends)

1665

guess done - returning to rule gus1 {"Z"} at depth 3 (guess mode ends)

1666

exit rule gus1 {"Z"} depth 3

1667

line 1: syntax error at "Z" missing SC

1668

...

1669

1670

Rule trace reporting is controlled by the value of the integer

1671

[zz]traceOptionValue: when it is positive tracing is enabled,

1672

otherwise it is disabled. Tracing during guess mode is controlled

1673

by the value of the integer [zz]traceGuessOptionValue. When

1674

it is positive AND [zz]traceOptionValue is positive rule trace

1675

is reported in guess mode.

1676

1677

The values of [zz]traceOptionValue and [zz]traceGuessOptionValue

1678

can be adjusted by subroutine calls listed below.

1679

1680

Depending on the presence or absence of the antlr -gd switch

1681

the variable [zz]traceOptionValueDefault is set to 0 or 1. When

1682

the parser is initialized or [zz]traceReset() is called the

1683

value of [zz]traceOptionValueDefault is copied to [zz]traceOptionValue.

1684

The value of [zz]traceGuessOptionValue is always initialized to 1,

1685

but, as noted earlier, nothing will be reported unless

1686

[zz]traceOptionValue is also positive.

1687

1688

When the parser state is saved/restored the values of the trace

1689

variables are also saved/restored. If a restore causes a change in

1690

reporting behavior from on to off or vice versa this will be reported.

1691

1692

When the -gd option is selected, the macro "#define zzTRACE_RULES"

1693

is added to appropriate output files.

1694

1695

C++ mode

1696

--------

1697

int traceOption(int delta)

1698

int traceGuessOption(int delta)

1699

void traceReset()

1700

int traceOptionValueDefault

1701

1702

C mode

1703

--------

1704

int zzTraceOption(int delta)

1705

int zzTraceGuessOption(int delta)

1706

void zzTraceReset()

1707

int zzTraceOptionValueDefault

1708

1709

The argument "delta" is added to the traceOptionValue. To

1710

turn on trace when inside a particular rule one can use:

1711

1712

rule : <<traceOption(+1);>>

1713

(

1714

rest-of-rule

1715

)

1716

<<traceOption(-1);>>

1717

; /* fail clause */ <<traceOption(-1);>>

1718

1719

One can use the same idea to turn *off* tracing within a

1720

rule by using a delta of (-1).

1721

1722

An improvement in the rule trace was suggested by Sramji

1723

Ramanathan (ps@kumaran.com).

1724

1725

#108. A Note on Deallocation of Variables Allocated in Guess Mode

1726

1727

NOTE

1728

------------------------------------------------------

1729

This mechanism only works for heap allocated variables

1730

------------------------------------------------------

1731

1732

The rewrite of the trace provides the machinery necessary

1733

to properly free variables or undo actions following a

1734

failed guess.

1735

1736

The macro zzUSER_GUESS_HOOK(guessSeq,zzrv) is expanded

1737

as part of the zzGUESS macro. When a guess is opened

1738

the value of zzrv is 0. When a longjmp() is executed to

1739

undo the guess, the value of zzrv will be 1.

1740

1741

The macro zzUSER_GUESS_DONE_HOOK(guessSeq) is expanded

1742

as part of the zzGUESS_DONE macro. This is executed

1743

whether the guess succeeds or fails as part of closing

1744

the guess.

1745

1746

The guessSeq is a sequence number which is assigned to each

1747

guess and is incremented by 1 for each guess which becomes

1748

active. It is needed by the user to associate the start of

1749

a guess with the failure and/or completion (closing) of a

1750

guess.

1751

1752

Guesses are nested. They must be closed in the reverse

1753

of the order that they are opened.

1754

1755

In order to free memory used by a variable during a guess

1756

a user must write a routine which can be called to

1757

register the variable along with the current guess sequence

1758

number provided by the zzUSER_GUESS_HOOK macro. If the guess

1759

fails, all variables tagged with the corresponding guess

1760

sequence number should be released. This is ugly, but

1761

it would require a major rewrite of antlr 1.33 to use

1762

some mechanism other than setjmp()/longjmp().

1763

1764

The order of calls for a *successful* guess would be:

1765

1766

zzUSER_GUESS_HOOK(guessSeq,0);

1767

zzUSER_GUESS_DONE_HOOK(guessSeq);

1768

1769

The order of calls for a *failed* guess would be:

1770

1771

zzUSER_GUESS_HOOK(guessSeq,0);

1772

zzUSER_GUESS_HOOK(guessSeq,1);

1773

zzUSER_GUESS_DONE_HOOK(guessSeq);

1774

1775

The default definitions of these macros are empty strings.

1776

1777

Here is an example in C++ mode. The zzUSER_GUESS_HOOK and

1778

zzUSER_GUESS_DONE_HOOK macros and myGuessHook() routine

1779

can be used without change in both C and C++ versions.

1780

1781

----------------------------------------------------------------------

1782

1783

1784

#include "AToken.h"

1785

1786

typedef ANTLRCommonToken ANTLRToken;

1787

1788

#include "DLGLexer.h"

1789

1790

int main() {

1791

1792

{

1793

DLGFileInput in(stdin);

1794

DLGLexer lexer(&in,2000);

1795

ANTLRTokenBuffer pipe(&lexer,1);

1796

ANTLRCommonToken aToken;

1797

P parser(&pipe);

1798

1799

lexer.setToken(&aToken);

1800

parser.init();

1801

parser.start();

1802

};

1803

1804

fclose(stdin);

1805

fclose(stdout);

1806

return 0;

1807

}

1808

1809

1810

1811

1812

char *s=NULL;

1813

1814

#undef zzUSER_GUESS_HOOK

1815

#define zzUSER_GUESS_HOOK(guessSeq,zzrv) myGuessHook(guessSeq,zzrv);

1816

#undef zzUSER_GUESS_DONE_HOOK

1817

#define zzUSER_GUESS_DONE_HOOK(guessSeq) myGuessHook(guessSeq,2);

1818

1819

void myGuessHook(int guessSeq,int zzrv) {

1820

if (zzrv == 0) {

1821

fprintf(stderr,"User hook: starting guess #%d\n",guessSeq);

1822

} else if (zzrv == 1) {

1823

free (s);

1824

s=NULL;

1825

fprintf(stderr,"User hook: failed guess #%d\n",guessSeq);

1826

} else if (zzrv == 2) {

1827

free (s);

1828

s=NULL;

1829

fprintf(stderr,"User hook: ending guess #%d\n",guessSeq);

1830

};

1831

}

1832

1833

1834

1835

#token A "a"

1836

#token "[\t \ \n]" <<skip();>>

1837

1838

class P {

1839

1840

start : (top)+

1841

;

1842

1843

top : (which) ? <<fprintf(stderr,"%s is a which\n",s); free(s); s=NULL; >>

1844

| other <<fprintf(stderr,"%s is an other\n",s); free(s); s=NULL; >>

1845

; <<if (s != NULL) free(s); s=NULL; >>

1846

1847

which : which2

1848

;

1849

1850

which2 : which3

1851

;

1852

which3

1853

: (label)? <<fprintf(stderr,"%s is a label\n",s);>>

1854

| (global)? <<fprintf(stderr,"%s is a global\n",s);>>

1855

| (exclamation)? <<fprintf(stderr,"%s is an exclamation\n",s);>>

1856

;

1857

1858

label : <<s=strdup(LT(1)->getText());>> A ":" ;

1859

1860

global : <<s=strdup(LT(1)->getText());>> A "::" ;

1861

1862

exclamation : <<s=strdup(LT(1)->getText());>> A "!" ;

1863

1864

other : <<s=strdup(LT(1)->getText());>> "other" ;

1865

1866

}

1867

----------------------------------------------------------------------

1868

1869

This is a silly example, but illustrates the idea. For the input

1870

"a ::" with tracing enabled the output begins:

1871

1872

----------------------------------------------------------------------

1873

enter rule "start" depth 1

1874

enter rule "top" depth 2

1875

User hook: starting guess #1

1876

enter rule "which" depth 3 guessing

1877

enter rule "which2" depth 4 guessing

1878

enter rule "which3" depth 5 guessing

1879

User hook: starting guess #2

1880

enter rule "label" depth 6 guessing

1881

guess failed

1882

User hook: failed guess #2

1883

guess done - returning to rule "which3" at depth 5 (guess mode continues

1884

- an enclosing guess is still active)

1885

User hook: ending guess #2

1886

User hook: starting guess #3

1887

enter rule "global" depth 6 guessing

1888

exit rule "global" depth 6 guessing

1889

guess done - returning to rule "which3" at depth 5 (guess mode continues

1890

- an enclosing guess is still active)

1891

User hook: ending guess #3

1892

enter rule "global" depth 6 guessing

1893

exit rule "global" depth 6 guessing

1894

exit rule "which3" depth 5 guessing

1895

exit rule "which2" depth 4 guessing

1896

exit rule "which" depth 3 guessing

1897

guess done - returning to rule "top" at depth 2 (guess mode ends)

1898

User hook: ending guess #1

1899

enter rule "which" depth 3

1900

.....

1901

----------------------------------------------------------------------

1902

1903

Remember:

1904

1905

(a) Only init-actions are executed during guess mode.

1906

(b) A rule can be invoked multiple times during guess mode.

1907

(c) If the guess succeeds the rule will be called one more time

1908

without guess mode so that normal actions will be executed.

1909

This means that the init-action might need to distinguish

1910

between guess mode and non-guess mode using the variable

1911

[zz]guessing.

1912

1913

#107. (Changed in 1.33MR10) construction of ASTs in guess mode

1914

1915

Prior to 1.33MR10, when using automatic AST construction in C++

1916

mode for a rule, an AST would be constructed for elements of the

1917

rule even while in guess mode. In MR10 this no longer occurs.

1918

1919

#106. (Changed in 1.33MR10) guess variable confusion

1920

1921

In C++ mode a guess which failed always restored the parser state

1922

using zzGUESS_DONE as part of zzGUESS_FAIL. Prior to 1.33MR10,

1923

C mode required an explicit call to zzGUESS_DONE after the

1924

call to zzGUESS_FAIL.

1925

1926

Consider:

1927

1928

rule : (alpha)? beta

1929

| ...

1930

;

1931

1932

The generated code resembles:

1933

1934

zzGUESS

1935

if (!zzrv && LA(1)==ID) { <==== line #1

1936

alpha

1937

zzGUESS_DONE

1938

beta

1939

} else {

1940

if (! zzrv) zzGUESS_DONE <==== line #2a

1941

....

1942

1943

However, in some cases line #2 was rendered:

1944

1945

if (guessing) zzGUESS_DONE <==== line #2b

1946

1947

This would work for simple test cases, but would fail in

1948

some cases where there was a guess while another guess was active.

1949

One kind of failure would be to match up the zzGUESS_DONE at line

1950

#2b with the "outer" guess which was still active. The outer

1951

guess would "succeed" when only the inner guess should have

1952

succeeded.

1953

1954

In 1.33MR10 the behavior of zzGUESS and zzGUESS_FAIL in C and

1955

C++ mode should be identical.

1956

1957

The same problem appears in 1.33 vanilla in some places. For

1958

example:

1959

1960

start : { (sub)? } ;

1961

1962

or:

1963

1964

start : (

1965

1966

| ( sub )?

1967

| C

1968

1969

;

1970

1971

generates incorrect code.

1972

1973

The general principle is:

1974

1975

(a) use [zz]guessing only when deciding between a call to zzFAIL

1976

or zzGUESS_FAIL

1977

1978

(b) use zzrv in all other cases

1979

1980

This problem was discovered while testing changes to item #105.

1981

I believe this is now fixed. My apologies.

1982

1983

#105. (Changed in 1.33MR10) guess block as single alt of (...)+

1984

1985

Prior to 1.33MR10 the following constructs:

1986

1987

rule_plus : (

1988

(sub)?

1989

)+

1990

;

1991

1992

rule_star : (

1993

(sub)?

1994

)*

1995

;

1996

1997

generated incorrect code for the guess block (which could result

1998

in runtime errors) because of an incorrect optimization of a

1999

block with only a single alternative.

2000

2001

The fix caused some changes to the fix described in Item #49

2002

because there are now three code generation sequences for (...)+

2003

blocks containing a guess block:

2004

2005

a. single alternative which is a guess block

2006

b. multiple alternatives in which the last is a guess block

2007

c. all other cases

2008

2009

Forms like "rule_star" can have unexpected behavior when there

2010

is a syntax error: if the subrule "sub" is not matched *exactly*

2011

then "rule_star" will consume no tokens.

2012

2013

Reported by Esa Pulkkinen (esap@cs.tut.fi).

2014

2015

#104. (Changed in 1.33MR10) -o option for dlg

2016

2017

There was a problem with the code added by item #74 to handle the

2018

-o option of dlg. This should fix it.

2019

2020

#103. (Changed in 1.33MR10) ANDed semantic predicates

2021

2022

Rescinded.

2023

2024

The optimization was a mistake.

2025

The resulting problem is described in Item #150.

2026

2027

#102. (Changed in 1.33MR10) allow "class parser : .... {"

2028

2029

The syntax of the class statement ("class parser-name {")

2030

has been extended to allow for the specification of base

2031

classes. An arbitrary number of tokens may now appear

2032

between the class name and the "{". They are output

2033

again when the class declaration is generated. For

2034

example:

2035

2036

class Parser : public MyBaseClassANTLRparser {

2037

2038

This was suggested by a user, but I don't have a record

2039

of who it was.

2040

2041

#101. (Changed in 1.33MR10) antlr -info command line switch

2042

2043

-info

2044

2045

p - extra predicate information in generated file

2046

2047

t - information about tnode use:

2048

at the end of each rule in generated file

2049

summary on stderr at end of program

2050

2051

m - monitor progress

2052

prints name of each rule as it is started

2053

flushes output at start of each rule

2054

2055

f - first/follow set information to stdout

2056

2057

0 - no operation (added in 1.33MR11)

2058

2059

The options may be combined and may appear in any order.

2060

For example:

2061

2062

antlr -info ptm -CC -gt -mrhoist on mygrammar.g

2063

2064

#100a. (Changed in 1.33MR10) Predicate tree simplification

2065

2066

When the same predicates can be referenced in more than one

2067

alternative of a block large predicate trees can be formed.

2068

2069

The difference that these optimizations make is so dramatic

2070

that I have decided to use it even when -mrhoist is not selected.

2071

2072

Consider the following grammar:

2073

2074

start : ( all )* ;

2075

2076

all : a

2077

| d

2078

| e

2079

| f

2080

;

2081

2082

a : c A B

2083

| c A C

2084

;

2085

2086

c : <<AAA(LATEXT(2))>>?

2087

;

2088

2089

d : <<BBB(LATEXT(2))>>? B C

2090

;

2091

2092

e : <<CCC(LATEXT(2))>>? B C

2093

;

2094

2095

f : e X Y

2096

;

2097

2098

In rule "a" there is a reference to rule "c" in both alternatives.

2099

The length of the predicate AAA is k=2 and it can be followed in

2100

alternative 1 only by (A B) while in alternative 2 it can be

2101

followed only by (A C). Thus they do not have identical context.

2102

2103

In rule "all" the alternatives which refer to rules "e" and "f" allow

2104

elimination of the duplicate reference to predicate CCC.

2105

2106

The table below summarizes the kind of simplification performed by

2107

1.33MR10. In the table, X and Y stand for single predicates

2108

(not trees).

2109

2110

(OR X (OR Y (OR Z))) => (OR X Y Z)

2111

(AND X (AND Y (AND Z))) => (AND X Y Z)

2112

2113

(OR X (... (OR X Y) ... )) => (OR X (... Y ... ))

2114

(AND X (... (AND X Y) ... )) => (AND X (... Y ... ))

2115

(OR X (... (AND X Y) ... )) => (OR X (... ... ))

2116

(AND X (... (OR X Y) ... )) => (AND X (... ... ))

2117

2118

(AND X) => X

2119

(OR X) => X

2120

2121

In a test with a complex grammar for a real application, a predicate

2122

tree with six OR nodes and 12 leaves was reduced to "(OR X Y Z)".

2123

2124

In 1.33MR10 there is a greater effort to release memory used

2125

by predicates once they are no longer in use.

2126

2127

#100b. (Changed in 1.33MR10) Suppression of extra predicate tests

2128

2129

The following optimizations require that -mrhoist be selected.

2130

2131

It is relatively easy to optimize the code generated for predicate

2132

gates when they are of the form:

2133

2134

(AND X Y Z ...)

2135

or (OR X Y Z ...)

2136

2137

where X, Y, Z, and "..." represent individual predicates (leaves) not

2138

predicate trees.

2139

2140

If the predicate is an AND the contexts of the X, Y, Z, etc. are

2141

ANDed together to create a single Tree context for the group and

2142

context tests for the individual predicates are suppressed:

2143

2144

--------------------------------------------------

2145

Note: This was incorrect. The contexts should be

2146

ORed together. This has been fixed. A more

2147

complete description is available in item #152.

2148

---------------------------------------------------

2149

2150

Optimization 1: (AND X Y Z ...)

2151

2152

Suppose the context for Xtest is LA(1)==LP and the context for

2153

Ytest is LA(1)==LP && LA(2)==ID.

2154

2155

Without the optimization the code would resemble:

2156

2157

if (lookaheadContext &&

2158

!(LA(1)==LP && LA(1)==LP && LA(2)==ID) ||

2159

( (! LA(1)==LP || Xtest) &&

2160

(! (LA(1)==LP && LA(2)==ID) || Ytest)

2161

)) {...

2162

2163

With the -mrhoist optimization the code would resemble:

2164

2165

if (lookaheadContext &&

2166

! (LA(1)==LP && LA(2)==ID) || (Xtest && Ytest) {...

2167

2168

Optimization 2: (OR X Y Z ...) with identical contexts

2169

2170

Suppose the context for Xtest is LA(1)==ID and for Ytest

2171

the context is also LA(1)==ID.

2172

2173

Without the optimization the code would resemble:

2174

2175

if (lookaheadContext &&

2176

! (LA(1)==ID || LA(1)==ID) ||

2177

(LA(1)==ID && Xtest) ||

2178

(LA(1)==ID && Ytest) {...

2179

2180

With the -mrhoist optimization the code would resemble:

2181

2182

if (lookaheadContext &&

2183

(! LA(1)==ID) || (Xtest || Ytest) {...

2184

2185

Optimization 3: (OR X Y Z ...) with distinct contexts

2186

2187

Suppose the context for Xtest is LA(1)==ID and for Ytest

2188

the context is LA(1)==LP.

2189

2190

Without the optimization the code would resemble:

2191

2192

if (lookaheadContext &&

2193

! (LA(1)==ID || LA(1)==LP) ||

2194

(LA(1)==ID && Xtest) ||

2195

(LA(1)==LP && Ytest) {...

2196

2197

With the -mrhoist optimization the code would resemble:

2198

2199

if (lookaheadContext &&

2200

(zzpf=0,

2201

(LA(1)==ID && (zzpf=1) && Xtest) ||

2202

(LA(1)==LP && (zzpf=1) && Ytest) ||

2203

!zzpf) {

2204

2205

These may appear to be of similar complexity at first,

2206

but the non-optimized version contains two tests of each

2207

context while the optimized version contains only one

2208

such test, as well as eliminating some of the inverted

2209

logic (" !(...) || ").

2210

2211

Optimization 4: Computation of predicate gate trees

2212

2213

When generating code for the gates of predicate expressions

2214

antlr 1.33 vanilla uses a recursive procedure to generate

2215

"&&" and "||" expressions for testing the lookahead. As each

2216

layer of the predicate tree is exposed a new set of "&&" and

2217

"||" expressions on the lookahead are generated. In many

2218

cases the lookahead being tested has already been tested.

2219

2220

With -mrhoist a lookahead tree is computed for the entire

2221

lookahead expression. This means that predicates with identical

2222

context or context which is a subset of another predicate's

2223

context disappear.

2224

2225

This is especially important for predicates formed by rules

2226

like the following:

2227

2228

upperCaseVowel : <<isUpperCase(LATEXT(1))>>? vowel;

2229

vowel : <<isVowel(LATEXT(1))>>? LETTERS;

2230

2231

These predicates are combined using AND since both must be

2232

satisfied for rule upperCaseVowel. They have identical

2233

context which makes this optimization very effective.

2234

2235

The effect of Items #100a and #100b together can be dramatic. In

2236

a very large (but real world) grammar one particular predicate

2237

expression was reduced from an (unreadable) 50 predicate leaves,

2238

195 LA(1) terms, and 5500 characters to an (easily comprehensible)

2239

3 predicate leaves (all different) and a *single* LA(1) term.

2240

2241

#99. (Changed in 1.33MR10) Code generation for expression trees

2242

2243

Expression trees are used for k>1 grammars and predicates with

2244

lookahead depth >1. This optimization must be enabled using

2245

"-mrhoist on". (Clarification added for 1.33MR11).

2246

2247

In the processing of expression trees, antlr can generate long chains

2248

of token comparisons. Prior to 1.33MR10 there were many redundant

2249

parentheses which caused problems for compilers which could handle

2250

expressions of only limited complexity. For example, to test an

2251

expression tree (root R A B C D), antlr would generate something

2252

resembling:

2253

2254

(LA(1)==R && (LA(2)==A || (LA(2)==B || (LA(2)==C || LA(2)==D)))))

2255

2256

If there were twenty tokens to test then there would be twenty

2257

parentheses at the end of the expression.

2258

2259

In 1.33MR10 the generated code for tree expressions resembles:

2260

2261

(LA(1)==R && (LA(2)==A || LA(2)==B || LA(2)==C || LA(2)==D))

2262

2263

For "complex" expressions the output is indented to reflect the LA

2264

number being tested:

2265

2266

(LA(1)==R

2267

&& (LA(2)==A || LA(2)==B || LA(2)==C || LA(2)==D

2268

|| LA(2)==E || LA(2)==F)

2269

|| LA(1)==S

2270

&& (LA(2)==G || LA(2)==H))

2271

2272

2273

Suggested by S. Bochnak (S.Bochnak@microTool.com.pl).

2274

2275

#98. (Changed in 1.33MR10) Option "-info p"

2276

2277

When the user selects option "-info p" the program will generate

2278

detailed information about predicates. If the user selects

2279

"-mrhoist on" additional detail will be provided explaining

2280

the promotion and suppression of predicates. The output is part

2281

of the generated file and sandwiched between #if 0/#endif statements.

2282

2283

Consider the following k=1 grammar:

2284

2285

start : ( all ) * ;

2286

2287

all : ( a

2288

| b

2289

)

2290

;

2291

2292

a : c B

2293

;

2294

2295

c : <<LATEXT(1)>>?

2296

| B

2297

;

2298

2299

b : <<LATEXT(1)>>? X

2300

;

2301

2302

Below is an excerpt of the output for rule "start" for the three

2303

predicate options (off, on, and maintenance release style hoisting).

2304

2305

For those who do not wish to use the "-mrhoist on" option for code

2306

generation the option can be used in a "diagnostic" mode to provide

2307

valuable information:

2308

2309

a. where one should insert null actions to inhibit hoisting

2310

b. a chain of rule references which shows where predicates are

2311

being hoisted

2312

2313

======================================================================

2314

Example of "-info p" with "-mrhoist on"

2315

======================================================================

2316

#if 0

2317

2318

Hoisting of predicate suppressed by alternative without predicate.

2319

The alt without the predicate includes all cases where the

2320

predicate is false.

2321

2322

WITH predicate: line 11 v36.g

2323

WITHOUT predicate: line 12 v36.g

2324

2325

The context set for the predicate:

2326

2327

2328

2329

The lookahead set for alt WITHOUT the semantic predicate:

2330

2331

2332

2333

The predicate:

2334

2335

pred << LATEXT(1)>>? depth=k=1 rule c line 11 v36.g

2336

2337

set context:

2338

2339

tree context: null

2340

2341

Chain of referenced rules:

2342

2343

#0 in rule start (line 1 v36.g) to rule all

2344

#1 in rule all (line 3 v36.g) to rule a

2345

#2 in rule a (line 8 v36.g) to rule c

2346

#3 in rule c (line 11 v36.g)

2347

2348

#endif

2349

2350

#if 0

2351

2352

pred << LATEXT(1)>>? depth=k=1 rule b line 15 v36.g

2353

2354

set context:

2355

2356

tree context: null

2357

2358

#endif

2359

======================================================================

2360

Example of "-info p" with the default -prc setting ( "-prc off")

2361

======================================================================

2362

#if 0

2363

2364

2365

pred << LATEXT(1)>>? depth=k=1 rule c line 11 v36.g

2366

2367

set context:

2368

nil

2369

tree context: null

2370

2371

pred << LATEXT(1)>>? depth=k=1 rule b line 15 v36.g

2372

2373

set context:

2374

nil

2375

tree context: null

2376

2377

#endif

2378

======================================================================

2379

Example of "-info p" with "-prc on" and "-mrhoist off"

2380

======================================================================

2381

#if 0

2382

2383

2384

pred << LATEXT(1)>>? depth=k=1 rule c line 11 v36.g

2385

2386

set context:

2387

2388

tree context: null

2389

2390

pred << LATEXT(1)>>? depth=k=1 rule b line 15 v36.g

2391

2392

set context:

2393

2394

tree context: null

2395

2396

#endif

2397

======================================================================

2398

2399

#97. (Fixed in 1.33MR10) "Predicate applied for more than one ... "

2400

2401

In 1.33 vanilla, the grammar listed below produced this message for

2402

the first alternative (only) of rule "b":

2403

2404

warning: predicate applied for >1 lookahead 1-sequences

2405

[you may only want one lookahead 1-sequence to apply.

2406

Try using a context guard '(...)? =>'

2407

2408

In 1.33MR10 the message is issued for both alternatives.

2409

2410

top : (a)*;

2411

a : b | c ;

2412

2413

b : <<PPP(LATEXT(1))>>? ( AAA | BBB )

2414

| <<QQQ(LATEXT(1))>>? ( XXX | YYY )

2415

;

2416

2417

c : AAA | XXX;

2418

2419

#96. (Fixed in 1.33MR10) Guard predicates ignored when -prc off

2420

2421

Prior to 1.33MR10, guard predicate code was not generated unless

2422

"-prc on" was selected.

2423

2424

This was incorrect, since "-prc off" (the default) is supposed to

2425

disable only AUTOMATIC computation of predicate context, not the

2426

programmer specified context supplied by guard predicates.

2427

2428

#95. (Fixed in 1.33MR10) Predicate guard context length was k, not max(k,ck)

2429

2430

Prior to 1.33MR10, predicate guards were computed to k tokens rather

2431

than max(k,ck). Consider the following grammar:

2432

2433

a : ( A B C)? => <<AAA(LATEXT(1))>>? (A|X) (B|Y) (C|Z) ;

2434

2435

The code generated by 1.33 vanilla with "-k 1 -ck 3 -prc on"

2436

for the predicate in "a" resembles:

2437

2438

if ( (! LA(1)==A) || AAA(LATEXT(1))) {...

2439

2440

With 1.33MR10 and the same options the code resembles:

2441

2442

if ( (! (LA(1)==A && LA(2)==B && LA(3)==C)) || AAA(LATEXT(1))) {...

2443

2444

#94. (Fixed in 1.33MR10) Predicates followed by rule references

2445

2446

Prior to 1.33MR10, a semantic predicate which referenced a token

2447

which was off the end of the rule caused an incomplete context

2448

to be computed (with "-prc on") for the predicate under some circum-

2449

stances. In some cases this manifested itself as illegal C code

2450

(e.g. "LA(2)==[Ep](1)") in the k=2 examples below:

2451

2452

all : ( a ) *;

2453

2454

a : <<AAA(LATEXT(2))>>? ID X

2455

| <<BBB(LATEXT(2))>>? Y

2456

| Z

2457

;

2458

2459

This might also occur when the semantic predicate was followed

2460

by a rule reference which was shorter than the length of the

2461

semantic predicate:

2462

2463

all : ( a ) *;

2464

2465

a : <<AAA(LATEXT(2))>>? ID X

2466

| <<BBB(LATEXT(2))>>? y

2467

| Z

2468

;

2469

2470

y : Y ;

2471

2472

Depending on circumstance, the resulting context might be too

2473

generous because it was too short, or too restrictive because

2474

of missing alternatives.

2475

2476

#93. (Changed in 1.33MR10) Definition of Purify macro

2477

2478

Ofer Ben-Ami (gremlin@cs.huji.ac.il) has supplied a definition

2479

for the Purify macro:

2480

2481

#define PURIFY(r, s) memset((char *) &(r), '\0', (s));

2482

2483

Note: This may not be the right thing to do for C++ objects that

2484

have constructors. Reported by Bonny Rais (bonny@werple.net.au).

2485

2486

For those cases one should #define PURIFY to an empty macro in the

2487

#header or #first actions.

2488

2489

#92. (Fixed in 1.33MR10) Guarded predicates and hoisting

2490

2491

When a guarded predicate participates in hoisting it is linked into

2492

a predicate expression tree. Prior to 1.33MR10 this link was never

2493

cleared and the next time the guard was used to construct a new

2494

tree the link could contain a spurious reference to another element

2495

which had previously been joined to it in the semantic predicate tree.

2496

2497

For example:

2498

2499

start : ( all ) *;

2500

all : ( a | b ) ;

2501

2502

start2 : ( all2 ) *;

2503

all2 : ( a ) ;

2504

2505

a : (A)? => <<AAA(LATEXT(1))>>? A ;

2506

b : (B)? => <<BBB(LATEXT(1))>>? B ;

2507

2508

Prior to 1.33MR10 the code for "start2" would include a spurious

2509

reference to the BBB predicate which was left from constructing

2510

the predicate tree for rule "start" (i.e. or(AAA,BBB) ).

2511

2512

In 1.33MR10 this problem is avoided by cloning the original guard

2513

each time it is linked into a predicate tree.

2514

2515

#91. (Changed in 1.33MR10) Extensive changes to semantic pred hoisting

2516

2517

============================================

2518

This has been rendered obsolete by Item #117

2519

============================================

2520

2521

#90. (Fixed in 1.33MR10) Semantic pred with LT(i) and i>max(k,ck)

2522

2523

There is a bug in antlr 1.33 vanilla and all maintenance releases

2524

prior to 1.33MR10 which allows semantic predicates to reference

2525

an LT(i) or LATEXT(i) where i is larger than max(k,ck). When

2526

this occurs antlr will attempt to mark the ith element of an array

2527

in which there are only max(k,ck) elements. The result cannot

2528

be predicted.

2529

2530

Using LT(i) or LATEXT(i) for i>max(k,ck) is reported as an error

2531

in 1.33MR10.

2532

2533

#89. Rescinded

2534

2535

#88. (Fixed in 1.33MR10) Tokens used in semantic predicates in guess mode

2536

2537

Consider the behavior of a semantic predicate during guess mode:

2538

2539

rule : a:A (

2540

<<test($a)>>? b:B

2541

| c:C

2542

);

2543

2544

Prior to MR10 the assignment of the token or attribute to

2545

$a did not occur during guess mode, which would cause the

2546

semantic predicate to misbehave because $a would be null.

2547

2548

In 1.33MR10 a semantic predicate with a reference to an

2549

element label (such as $a) forces the assignment to take

2550

place even in guess mode.

2551

2552

In order to work, this fix REQUIRES use of the $label format

2553

for token pointers and attributes referenced in semantic

2554

predicates.

2555

2556

The fix does not apply to semantic predicates using the

2557

numeric form to refer to attributes (e.g. <<test($1)>>?).

2558

The user will receive a warning for this case.

2559

2560

Reported by Rob Trout (trout@mcs.cs.kent.edu).

2561

2562

#87. (Fixed in 1.33MR10) Malformed guard predicates

2563

2564

Context guard predicates may contain only references to

2565

tokens. They may not contain references to (...)+ and

2566

(...)* blocks. This is now checked. This replaces the

2567

fatal error message in item #78 with an appropriate

2568

(non-fatal) error message.

2569

2570

In theory, context guards should be allowed to reference

2571

rules. However, I have not had time to fix this.

2572

Evaluation of the guard takes place before all rules have

2573

been read, making it difficult to resolve a forward reference

2574

to rule "zzz" - it hasn't been read yet ! To postpone evaluation

2575

of the guard until all rules have been read is too much

2576

for the moment.

2577

2578

#86. (Fixed in 1.33MR10) Unequal set size in set_sub

2579

2580

Routine set_sub() in pccts/support/set/set.h did not work

2581

correctly when the sets were of unequal sizes. Rewrote

2582

set_equ to make it simpler and remove unnecessary and

2583

expensive calls to set_deg(). This routine was not used

2584

in 1.33 vanilla.

2585

2586

#85. (Changed in 1.33MR10) Allow redefinition of MaxNumFiles

2587

2588

Raised the maximum number of input files to 99 from 20.

2589

Put a #ifndef/#endif around the "#define MaxNumFiles 99".

2590

2591

#84. (Fixed in 1.33MR10) Initialize zzBadTok in macro zzRULE

2592

2593

Initialize zzBadTok to NULL in zzRULE macro of AParser.h

2594

in order to get rid of warning messages.

2595

2596

#83. (Fixed in 1.33MR10) False warnings with -w2 for #tokclass

2597

2598

When -w2 is selected antlr gives inappropriate warnings about

2599

#tokclass names not having any associated regular expressions.

2600

Since a #tokclass is not a "real" token it will never have an

2601

associated regular expression and there should be no warning.

2602

2603

Reported by Derek Pappas (derek.pappas@eng.sun.com)

2604

2605

#82. (Fixed in 1.33MR10) Computation of follow sets with multiple cycles

2606

2607

Reinier van den Born (reinier@vnet.ibm.com) reported a problem

2608

in the computation of follow sets by antlr. The problem (bug)

2609

exists in 1.33 vanilla and all maintenance releases prior to 1.33MR10.

2610

2611

The problem involves the computation of follow sets when there are

2612

cycles - rules which have mutual references. I believe the problem

2613

is restricted to cases where there is more than one cycle AND

2614

elements of those cycles have rules in common. Even when this

2615

occurs it may not affect the code generated - but it might. It

2616

might also lead to undetected ambiguities.

2617

2618

There were no changes in antlr or dlg output from the revised version.

2619

2620

The following fragment demonstrates the problem by giving different

2621

follow sets (option -pa) for var_access when built with k=1 and ck=2 on

2622

1.33 vanilla and 1.33MR10:

2623

2624

echo_statement : ECHO ( echo_expr )*

2625

;

2626

2627

echo_expr : ( command )?

2628

| expression

2629

;

2630

2631

command : IDENTIFIER

2632

{ concat }

2633

;

2634

2635

expression : operand ( OPERATOR operand )*

2636

;

2637

2638

operand : value

2639

| START command END

2640

;

2641

2642

value : concat

2643

| TYPE operand

2644

;

2645

2646

concat : var_access { CONCAT value }

2647

;

2648

2649

var_access : IDENTIFIER { INDEX }

2650

2651

;

2652

#81. (Changed in 1.33MR10) C mode use of attributes and ASTs

2653

2654

Reported by Isaac Clark (irclark@mindspring.com).

2655

2656

C mode code ignores attributes returned by rules which are

2657

referenced using element labels when ASTs are enabled (-gt option).

2658

2659

1. start : r:rule t:Token <<$start=$r;>>

2660

2661

The $r reference will not work when combined with

2662

the -gt option.

2663

2664

2. start : t:Token <<$start=$t;>>

2665

2666

The $t reference works in all cases.

2667

2668

3. start : rule <<$0=$1;>>

2669

2670

Numeric labels work in all cases.

2671

2672

With MR10 the user will receive an error message for case 1 when

2673

the -gt option is used.

2674

2675

#80. (Fixed in 1.33MR10) (...)? as last alternative of block

2676

2677

A construct like the following:

2678

2679

rule : a

2680

| (b)?

2681

;

2682

2683

does not make sense because there is no alternative when

2684

the guess block fails. This is now reported as a warning

2685

to the user.

2686

2687

Previously, there was a code generation error for this case:

2688

the guess block was not "closed" when the guess failed.

2689

This could cause an infinite loop or other problems. This

2690

is now fixed.

2691

2692

Example problem:

2693

2694

#header<<

2695

#include <stdio.h>

2696

#include "charptr.h"

2697

2698

2699

2700

#include "charptr.c"

2701

main ()

2702

{

2703

ANTLR(start(),stdin);

2704

}

2705

2706

2707

#token "[\ \t]+" << zzskip(); >>

2708

#token "[\n]" << zzline++; zzskip(); >>

2709

2710

#token Word "[a-z]+"

2711

#token Number "[0-9]+"

2712

2713

2714

start : (test1)?

2715

| (test2)?

2716

;

2717

test1 : (Word Word Word Word)?

2718

| (Word Word Word Number)?

2719

;

2720

test2 : (Word Word Number Word)?

2721

| (Word Word Number Number)?

2722

;

2723

2724

Test data which caused infinite loop:

2725

2726

a 1 a a

2727

2728

#79. (Changed in 1.33MR10) Use of -fh with multiple parsers

2729

2730

Previously, antlr always used the pre-processor symbol

2731

STDPCCTS_H as a gate for the file stdpccts.h. This

2732

caused problems when there were multiple parsers defined

2733

because they used the same gate symbol.

2734

2735

In 1.33MR10, the -fh filename is used to generate the

2736

gate file for stdpccts.h. For instance:

2737

2738

antlr -fh std_parser1.h

2739

2740

generates the pre-processor symbol "STDPCCTS_std_parser1_H".

2741

2742

Reported by Ramanathan Santhanam (ps@kumaran.com).

2743

2744

#78. (Changed in 1.33MR9) Guard predicates that refer to rules

2745

2746

------------------------

2747

Please refer to Item #87

2748

------------------------

2749

2750

Guard predicates are processed during an early phase

2751

of antlr (during parsing) before all data structures

2752

are completed.

2753

2754

There is an apparent bug in earlier versions of 1.33

2755

which caused guard predicates which contained references

2756

to rules (rather than tokens) to reference a structure

2757

which hadn't yet been initialized.

2758

2759

In some cases (perhaps all cases) references to rules

2760

in guard predicates resulted in the use of "garbage".

2761

2762

#79. (Changed in 1.33MR9) Jeff Vincent (JVincent@novell.com)

2763

2764

Previously, the maximum length file name was set

2765

arbitrarily to 300 characters in antlr, dlg, and sorcerer.

2766

2767

The config.h file now attempts to define the maximum length

2768

filename using _MAX_PATH from stdlib.h before falling back

2769

to using the value 300.

2770

2771

#78. (Changed in 1.33MR9) Jeff Vincent (JVincent@novell.com)

2772

2773

Put #ifndef/#endif around definition of ZZLEXBUFSIZE in

2774

antlr.

2775

2776

#77. (Changed in 1.33MR9) Arithmetic overflow for very large grammars

2777

2778

In routine HandleAmbiguities() antlr attempts to compute the

2779

number of possible elements in a set that is order of

2780

number-of-tokens raised to the number-of-lookahead-tokens power.

2781

For large grammars or large lookahead (e.g. -ck 7) this can

2782

cause arithmetic overflow.

2783

2784

With 1.33MR9, arithmetic overflow in this computation is reported

2785

the first time it happens. The program continues to run and

2786

the program branches based on the assumption that the computed

2787

value is larger than any number computed by counting actual cases

2788

because 2**31 is larger than the number of bits in most computers.

2789

2790

Before 1.33MR9 overflow was not reported. The behavior following

2791

overflow is not predictable by anyone but the original author.

2792

2793

NOTE

2794

2795

In 1.33MR10 the warning message is suppressed.

2796

The code which detects the overflow allows the

2797

computation to continue without an error. The

2798

error message itself made users worry.

2799

2800

#76. (Changed in 1.33MR9) Jeff Vincent (JVincent@novell.com)

2801

2802

Jeff Vincent has convinced me to make ANTLRCommonToken and

2803

ANTLRCommonNoRefCountToken use variable length strings

2804

allocated from the heap rather than fixed length strings.

2805

By suitable definition of setText(), the copy constructor,

2806

and operator =() it is possible to maintain "copy" semantics.

2807

By "copy" semantics I mean that when a token is copied from

2808

an existing token it receives its own, distinct, copy of the

2809

text allocated from the heap rather than simply a pointer

2810

to the original token's text.

2811

2812

============================================================

2813

W * A * R * N * I * N * G

2814

============================================================

2815

2816

It is possible that this may cause problems for some users.

2817

For those users I have included the old version of AToken.h as

2818

pccts/h/AToken_traditional.h.

2819

2820

#75. (Changed in 1.33MR9) Bruce Guenter (bruceg@qcc.sk.ca)

2821

2822

Make DLGStringInput const correct. Since this is infrequently

2823

subclassed, it should affect few users, I hope.

2824

2825

#74. (Changed in 1.33MR9) -o (output directory) option

2826

2827

Antlr does not properly handle the -o output directory option

2828

when the filename of the grammar contains a directory part. For

2829

example:

2830

2831

antlr -o outdir pccts_src/myfile.g

2832

2833

causes antlr to create a file called "outdir/pccts_src/myfile.cpp".

2834

It SHOULD create outdir/myfile.cpp

2835

2836

The suggested code fix has been installed in antlr, dlg, and

2837

Sorcerer.

2838

2839

#73. (Changed in 1.33MR9) Hoisting of semantic predicates and -mrhoist

2840

2841

============================================

2842

This has been rendered obsolete by Item #117

2843

============================================

2844

2845

#72. (Changed in 1.33MR9) virtual saveState()/restoreState()/guess_XXX

2846

2847

The following methods in ANTLRParser were made virtual at

2848

the request of S. Bochnak (S.Bochnak@microTool.com.pl):

2849

2850

saveState() and restoreState()

2851

guess(), guess_fail(), and guess_done()

2852

2853

#71. (Changed in 1.33MR9) Access to omitted command line argument

2854

2855

If a switch requiring arguments is the last thing on the

2856

command line, and the argument is omitted, antlr would core.

2857

2858

antlr test.g -prc

2859

2860

instead of

2861

2862

antlr test.g -prc off

2863

2864

#70. (Changed in 1.33MR9) Addition of MSVC .dsp and .mak build files

2865

2866

The following MSVC .dsp and .mak files for pccts and sorcerer

2867

were contributed by Stanislaw Bochnak (S.Bochnak@microTool.com.pl)

2868

and Jeff Vincent (JVincent@novell.com)

2869

2870

PCCTS Distribution Kit

2871

----------------------

2872

pccts/PCCTSMSVC50.dsw

2873

2874

pccts/antlr/AntlrMSVC50.dsp

2875

pccts/antlr/AntlrMSVC50.mak

2876

2877

pccts/dlg/DlgMSVC50.dsp

2878

pccts/dlg/DlgMSVC50.mak

2879

2880

pccts/support/msvc.dsp

2881

2882

Sorcerer Distribution Kit

2883

-------------------------

2884

pccts/sorcerer/SorcererMSVC50.dsp

2885

pccts/sorcerer/SorcererMSVC50.mak

2886

2887

pccts/sorcerer/lib/msvc.dsp

2888

2889

#69. (Changed in 1.33MR9) Change "unsigned int" to plain "int"

2890

2891

Declaration of max_token_num in misc.c as "unsigned int"

2892

caused comparison between signed and unsigned ints giving

2893

warning message without any special benefit.

2894

2895

#68. (Changed in 1.33MR9) Add void return for dlg internal_error()

2896

2897

Get rid of "no return value" message in internal_error()

2898

in file dlg/support.c and dlg/dlg.h.

2899

2900

#67. (Changed in Sor) sor.g: lisp() has no return value

2901

2902

Added a "void" for the return type.

2903

2904

#66. (Added to Sor) sor.g: ZZLEXBUFSIZE enclosed in #ifndef/#endif

2905

2906

A user needed to be able to change the ZZLEXBUFSIZE for

2907

sor. Put the definition of ZZLEXBUFSIZE inside #ifndef/#endif

2908

2909

#65. (Changed in 1.33MR9) PCCTSAST::deepCopy() and ast_dup() bug

2910

2911

Jeff Vincent (JVincent@novell.com) found that deepCopy()

2912

made new copies of only the direct descendants. No new

2913

copies were made of sibling nodes. Sibling pointers are

2914

set to zero by shallowCopy().

2915

2916

PCCTS_AST::deepCopy() has been changed to make a

2917

deep copy in the traditional sense.

2918

2919

The deepCopy() routine depends on the behavior of

2920

shallowCopy(). In all sor examples I've found,

2921

shallowCopy() zeroes the right and down pointers.

2922

2923

Original Tree Original deepCopy() Revised deepCopy

2924

------------- ------------------- ----------------

2925

a->b->c A A

2926

| | |

2927

d->e->f D D->E->F

2928

| | |

2929

g->h->i G G->H->I

2930

| |

2931

j->k J->K

2932

2933

While comparing deepCopy() for C++ mode with ast_dup for

2934

C mode I found a problem with ast_dup().

2935

2936

Routine ast_dup() has been changed to make a deep copy

2937

in the traditional sense.

2938

2939

Original Tree Original ast_dup() Revised ast_dup()

2940

------------- ------------------- ----------------

2941

a->b->c A->B->C A

2942

| | |

2943

d->e->f D->E->F D->E->F

2944

| | |

2945

g->h->i G->H->I G->H->I

2946

| | |

2947

j->k J->K J->K

2948

2949

2950

I believe this affects transform mode sorcerer programs only.

2951

2952

#64. (Changed in 1.33MR9) antlr/hash.h prototype for killHashTable()

2953

2954

#63. (Changed in 1.33MR8) h/charptr.h does not zero pointer after free

2955

2956

The charptr.h routine now zeroes the pointer after free().

2957

2958

Reported by Jens Tingleff (jensting@imaginet.fr)

2959

2960

#62. (Changed in 1.33MR8) ANTLRParser::resynch had static variable

2961

2962

The static variable "consumed" in ANTLRParser::resynch was

2963

changed into an instance variable of the class with the

2964

name "resynchConsumed".

2965

2966

Reported by S.Bochnak@microTool.com.pl

2967

2968

#61. (Changed in 1.33MR8) Using rule>[i,j] when rule has no return values

2969

2970

Previously, the following code would cause antlr to core when

2971

it tried to generate code for rule1 because rule2 had no return

2972

values ("upward inheritance"):

2973

2974

rule1 : <<int i; int j>>

2975

rule2 > [i,j]

2976

;

2977

2978

rule2 : Anything ;

2979

2980

Reported by S.Bochnak@microTool.com.pl

2981

2982

Verified correct operation of antlr MR8 when missing or extra

2983

inheritance arguments for all combinations. When there are

2984

missing or extra arguments code will still be generated even

2985

though this might cause the invocation of a subroutine with

2986

the wrong number of arguments.

2987

2988

#60. (Changed in 1.33MR7) Major changes to exception handling

2989

2990

There were significant problems in the handling of exceptions

2991

in 1.33 vanilla. The general problem is that it can only

2992

process one level of exception handler. For example, a named

2993

exception handler, an exception handler for an alternative, or

2994

an exception for a subrule always went to the rule's exception

2995

handler if there was no "catch" which matched the exception.

2996

2997

In 1.33MR7 the exception handlers properly "nest". If an

2998

exception handler does not have a matching "catch" then the

2999

nextmost outer exception handler is checked for an appropriate

3000

"catch" clause, and so on until an exception handler with an

3001

appropriate "catch" is found.

3002

3003

There are still undesirable features in the way exception

3004

handlers are implemented, but I do not have time to fix them

3005

at the moment:

3006

3007

The exception handlers for alternatives are outside the

3008

block containing the alternative. This makes it impossible

3009

to access variables declared in a block or to resume the

3010

parse by "falling through". The parse can still be easily

3011

resumed in other ways, but not in the most natural fashion.

3012

3013

This results in an inconsistency between named exception

3014

handlers and exception handlers for alternatives. When

3015

an exception handler for an alternative "falls through"

3016

it goes to the nextmost outer handler - not the "normal

3017

action".

3018

3019

A major difference between 1.33MR7 and 1.33 vanilla is

3020

the default action after an exception is caught:

3021

3022

1.33 Vanilla

3023

------------

3024

In 1.33 vanilla the signal value is set to zero ("NoSignal")

3025

and the code drops through to the code following the exception.

3026

For named exception handlers this is the "normal action".

3027

For alternative exception handlers this is the rule's handler.

3028

3029

1.33MR7

3030

-------

3031

In 1.33MR7 the signal value is NOT automatically set to zero.

3032

3033

There are two cases:

3034

3035

For named exception handlers: if the signal value has been

3036

set to zero the code drops through to the "normal action".

3037

3038

For all other cases the code branches to the nextmost outer

3039

exception handler until it reaches the handler for the rule.

3040

3041

The following macros have been defined for convenience:

3042

3043

C/C++ Mode Name

3044

--------------------

3045

(zz)suppressSignal

3046

set signal & return signal arg to 0 ("NoSignal")

3047

(zz)setSignal(intValue)

3048

set signal & return signal arg to some value

3049

(zz)exportSignal

3050

copy the signal value to the return signal arg

3051

3052

I'm not sure why PCCTS makes a distinction between the local

3053

signal value and the return signal argument, but I'm loath

3054

to change the code. The burden of copying the local signal

3055

value to the return signal argument can be given to the

3056

default signal handler, I suppose.

3057

3058

#59. (Changed in 1.33MR7) Prototypes for some functions

3059

3060

Added prototypes for the following functions to antlr.h

3061

3062

zzconsumeUntil()

3063

zzconsumeUntilToken()

3064

3065

#58. (Changed in 1.33MR7) Added definition of zzbufsize to dlgauto.h

3066

3067

#57. (Changed in 1.33MR7) Format of #line directive

3068

3069

Previously, the -gl directive for line 1234 would

3070

resemble: "# 1234 filename.g". This caused problems

3071

for some compilers/pre-processors. In MR7 it generates

3072

"#line 1234 filename.g".

3073

3074

#56. (Added in 1.33MR7) Jan Mikkelsen <janm@zeta.org.au>

3075

3076

Move PURIFY macro invocation to after rule's init action.

3077

3078

#55. (Fixed in 1.33MR7) Uninitialized variables in ANTLRParser

3079

3080

Member variables inf_labase and inf_last were not initialized.

3081

(See item #50.)

3082

3083

#54. (Fixed in 1.33MR6) Brad Schick (schick@interaccess.com)

3084

3085

Previously, the following constructs generated the same

3086

code:

3087

3088

rule1 : (A B C)?

3089

| something-else

3090

;

3091

3092

rule2 : (A B C)? ()

3093

| something-else

3094

;

3095

3096

In all versions of pccts rule1 guesses (A B C) and then

3097

consumes all three tokens if the guess succeeds. In MR6

3098

rule2 guesses (A B C) but consumes NONE of the tokens

3099

when the guess succeeds because "()" matches epsilon.

3100

3101

#53. (Explanation for 1.33MR6) What happens after an exception is caught ?

3102

3103

The Book is silent about what happens after an exception

3104

is caught.

3105

3106

The following code fragment prints "Error Action" followed

3107

by "Normal Action".

3108

3109

test : Word ex:Number <<printf("Normal Action\n");>>

3110

exception[ex]

3111

catch NoViableAlt:

3112

<<printf("Error Action\n");>>

3113

;

3114

3115

The reason for "Normal Action" is that the normal flow of the

3116

program after a user-written exception handler is to "drop through".

3117

In the case of an exception handler for a rule this results in

3118

the execution of a "return" statement. In the case of an

3119

exception handler attached to an alternative, rule, or token

3120

this is the code that would have executed had there been no

3121

exception.

3122

3123

The user can achieve the desired result by using a "return"

3124

statement.

3125

3126

test : Word ex:Number <<printf("Normal Action\n");>>

3127

exception[ex]

3128

catch NoViableAlt:

3129

<<printf("Error Action\n"); return;>>

3130

;

3131

3132

The most powerful mechanism for recovery from parse errors

3133

in pccts is syntactic predicates because they provide

3134

backtracking. Exceptions allow "return", "break",

3135

"consumeUntil(...)", "goto _handler", "goto _fail", and

3136

changing the _signal value.

3137

3138

#52. (Fixed in 1.33MR6) Exceptions without syntactic predicates

3139

3140

The following generates bad code in 1.33 if no syntactic

3141

predicates are present in the grammar.

3142

3143

test : Word ex:Number <<printf("Normal Action\n");>>

3144

exception[ex]

3145

catch NoViableAlt:

3146

<<printf("Error Action\n");>>

3147

3148

There is a reference to a guess variable. In C mode

3149

this causes a compiler error. In C++ mode it generates

3150

an extraneous check on member "guessing".

3151

3152

In MR6 correct code is generated for both C and C++ mode.

3153

3154

#51. (Added to 1.33MR6) Exception operator "@" used without exceptions

3155

3156

In MR6 added a warning when the exception operator "@" is

3157

used and no exception group is defined. This is probably

3158

a case where "\@" or "@" is meant.

3159

3160

#50. (Fixed in 1.33MR6) Gunnar Rønning (gunnar@candleweb.no)

3161

http://www.candleweb.no/~gunnar/

3162

3163

Routines zzsave_antlr_state and zzrestore_antlr_state don't

3164

save and restore all the data needed when switching states.

3165

3166

Suggested patch applied to antlr.h and err.h for MR6.

3167

3168

#49. (Fixed in 1.33MR6) Sinan Karasu (sinan@boeing.com)

3169

3170

Generated code failed to turn off guess mode when leaving a

3171

(...)+ block which contained a guess block. The result was

3172

an infinite loop. For example:

3173

3174

rule : (

3175

(x)?

3176

| y

3177

3178

3179

Suggested code fix implemented in MR6. Replaced

3180

3181

... else if (zzcnt>1) break;

3182

3183

with:

3184

3185

C++ mode:

3186

... else if (zzcnt>1) {if (!zzrv) zzGUESS_DONE; break;};

3187

C mode:

3188

... else if (zzcnt>1) {if (zzguessing) zzGUESS_DONE; break;};

3189

3190

#48. (Fixed in 1.33MR6) Invalid exception element causes core

3191

3192

A label attached to an invalid construct can cause

3193

pccts to crash while processing the exception associated

3194

with the label. For example:

3195

3196

rule : t:(B C)

3197

exception[t] catch MismatchedToken: <<printf(...);>>

3198

3199

Version MR6 generates the message:

3200

3201

reference in exception handler to undefined label 't'

3202

3203

#47. (Fixed in 1.33MR6) Manuel Ornato

3204

3205

Under some circumstances involving a k >1 or ck >1

3206

grammar and a loop block (i.e. (...)* ) pccts will

3207

fail to detect a syntax error and loop indefinitely.

3208

The problem did not exist in 1.20, but has existed

3209

from 1.23 to the present.

3210

3211

Fixed in MR6.

3212

3213

---------------------------------------------------

3214

Complete test program

3215

---------------------------------------------------

3216

#header<<

3217

#include <stdio.h>

3218

#include "charptr.h"

3219

3220

3221

3222

#include "charptr.c"

3223

main ()

3224

{

3225

ANTLR(global(),stdin);

3226

}

3227

3228

3229

#token "[\ \t]+" << zzskip(); >>

3230

#token "[\n]" << zzline++; zzskip(); >>

3231

3232

#token B "b"

3233

#token C "c"

3234

#token D "d"

3235

#token E "e"

3236

#token LP "\("

3237

#token RP "\)"

3238

3239

#token ANTLREOF "@"

3240

3241

global : (

3242

(E liste)

3243

| liste

3244

| listed

3245

) ANTLREOF

3246

;

3247

3248

listeb : LP ( B ( B | C )* ) RP ;

3249

listec : LP ( C ( B | C )* ) RP ;

3250

listed : LP ( D ( B | C )* ) RP ;

3251

liste : ( listeb | listec )* ;

3252

3253

---------------------------------------------------

3254

Sample data causing infinite loop

3255

---------------------------------------------------

3256

e (d c)

3257

---------------------------------------------------

3258

3259

#46. (Fixed in 1.33MR6) Robert Richter

3260

(Robert.Richter@infotech.tu-chemnitz.de)

3261

3262

This item from the list of known problems was

3263

fixed by item #18 (below).

3264

3265

#45. (Fixed in 1.33MR6) Brad Schick (schick@interaccess.com)

3266

3267

The dependency scanner in VC++ mistakenly sees a

3268

reference to an MPW #include file even though properly

3269

#ifdef/#endif in config.h. The suggested workaround

3270

has been implemented:

3271

3272

#ifdef MPW

3273

.....

3274

#define MPW_CursorCtl_Header <CursorCtl.h>

3275

#include MPW_CursorCtl_Header

3276

.....

3277

#endif

3278

3279

#44. (Fixed in 1.33MR6) cast malloc() to (char *) in charptr.c

3280

3281

Added (char *) cast for systems where malloc returns "void *".

3282

3283

#43. (Added to 1.33MR6) Bruce Guenter (bruceg@qcc.sk.ca)

3284

3285

Add setLeft() and setUp methods to ASTDoublyLinkedBase

3286

for symmetry with setRight() and setDown() methods.

3287

3288

#42. (Fixed in 1.33MR6) Jeff Katcher (jkatcher@nortel.ca)

3289

3290

C++ style comment in antlr.c corrected.

3291

3292

#41. (Added in 1.33MR6) antlr -stdout

3293

3294

Using "antlr -stdout ..." forces the text that would

3295

normally go to the grammar.c or grammar.cpp file to

3296

stdout.

3297

3298

#40. (Added in 1.33MR6) antlr -tab to change tab stops

3299

3300

Using "antlr -tab number ..." changes the tab stops

3301

for the grammar.c or grammar.cpp file. The number

3302

must be between 0 and 8. Using 0 gives tab characters,

3303

values between 1 and 8 give the appropriate number of

3304

space characters.

3305

3306

#39. (Fixed in 1.33MR5) Jan Mikkelsen <janm@zeta.org.au>

3307

3308

Commas in function prototype still not correct under

3309

some circumstances. Suggested code fix installed.

3310

3311

#38. (Fixed in 1.33MR5) ANTLRTokenBuffer constructor

3312

3313

Have ANTLRTokenBuffer ctor initialize member "parser" to null.

3314

3315

#37. (Fixed in 1.33MR4) Bruce Guenter (bruceg@qcc.sk.ca)

3316

3317

In ANTLRParser::FAIL(int k,...) released memory pointed to by

3318

f[i] (as well as f itself). Should only free f itself.

3319

3320

#36. (Fixed in 1.33MR3) Cortland D. Starrett (cort@shay.ecn.purdue.edu)

3321

3322

Neglected to properly declare isDLGmaxToken() when fixing problem

3323

reported by Andreas Magnusson.

3324

3325

Undo "_retv=NULL;" change which caused problems for return values

3326

from rules whose return values weren't pointers.

3327

3328

Failed to create bin directory if it didn't exist.

3329

3330

#35. (Fixed in 1.33MR2) Andreas Magnusson

3331

(Andreas.Magnusson@mailbox.swipnet.se)

3332

3333

Repair bug introduced by 1.33MR1 for #tokdefs. The original fix

3334

placed "DLGmaxToken=9999" and "DLGminToken=0" in the TokenType enum

3335

in order to fix a problem with an aggressive compiler assigning an 8

3336

bit enum which might be too narrow. This caused #tokdefs to assume

3337

that there were 9999 real tokens. The repair to the fix causes antlr to

3338

ignore TokenTypes "DLGmaxToken" and "DLGminToken" in a #tokdefs file.

3339

3340

#34. (Added to 1.33MR1) Add public DLGLexerBase::set_line(int newValue)

3341

3342

Previously there was no public function for changing the line

3343

number maintained by the lexer.

3344

3345

#33. (Fixed in 1.33MR1) Franklin Chen (chen@adi.com)

3346

3347

Accidental use of EXIT_FAILURE rather than PCCTS_EXIT_FAILURE

3348

in pccts/h/AParser.cpp.

3349

3350

#32. (Fixed in 1.33MR1) Franklin Chen (chen@adi.com)

3351

3352

In PCCTSAST.cpp lines 405 and 466: Change

3353

3354

free (t)

3355

3356

free ( (char *)t );

3357

3358

to match prototype.

3359

3360

#31. (Added to 1.33MR1) Pointer to parser in ANTLRTokenBuffer

3361

Pointer to parser in DLGLexerBase

3362

3363

The ANTLRTokenBuffer class now contains a pointer to the

3364

parser which is using it. This is established by the

3365

ANTLRParser constructor calling ANTLRTokenBuffer::

3366

setParser(ANTLRParser *p).

3367

3368

When ANTLRTokenBuffer::setParser(ANTLRParser *p) is

3369

called it saves the pointer to the parser and then

3370

calls ANTLRTokenStream::setParser(ANTLRParser *p)

3371

so that the lexer can also save a pointer to the

3372

parser.

3373

3374

There is also a function getParser() in each class

3375

with the obvious purpose.

3376

3377

It is possible that these functions will return NULL

3378

under some circumstances (e.g. a non-DLG lexer is used).

3379

3380

#30. (Added to 1.33MR1) function tokenName(int token) standard

3381

3382

The generated parser class now includes the

3383

function:

3384

3385

static const ANTLRChar * tokenName(int token)

3386

3387

which returns a pointer to the "name" corresponding

3388

to the token.

3389

3390

The base class (ANTLRParser) always includes the

3391

member function:

3392

3393

const ANTLRChar * parserTokenName(int token)

3394

3395

which can be accessed by objects which have a pointer

3396

to an ANTLRParser, but do not know the name of the

3397

parser class (e.g. ANTLRTokenBuffer and DLGLexerBase).

3398

3399

#29. (Added to 1.33MR1) Debugging DLG lexers

3400

3401

If the pre-processor symbol DEBUG_LEXER is defined

3402

then DLexerBase will include code for printing out

3403

key information about tokens which are recognized.

3404

3405

The debug feature of the lexer is controlled by:

3406

3407

int previousDebugValue=lexer.debugLexer(newValue);

3408

3409

a value of 0 disables output

3410

a value of 1 enables output

3411

3412

Even if the lexer debug code is compiled into DLexerBase

3413

it must be enabled before any output is generated. For

3414

example:

3415

3416

DLGFileInput in(stdin);

3417

MyDLG lexer(&in,2000);

3418

3419

lexer.setToken(&aToken);

3420

3421

#if DEBUG_LEXER

3422

lexer.debugLexer(1); // enable debug information

3423

#endif

3424

3425

#28. (Added to 1.33MR1) More control over DLG header

3426

3427

Version 1.33MR1 adds the following directives to PCCTS

3428

for C++ mode:

3429

3430

#lexprefix <<source code>>

3431

3432

Adds source code to the DLGLexer.h file

3433

after the #include "DLexerBase.h" but

3434

before the start of the class definition.

3435

3436

#lexmember <<source code>>

3437

3438

Adds source code to the DLGLexer.h file

3439

as part of the DLGLexer class body. It

3440

appears immediately after the start of

3441

the class and a "public:" statement.

3442

3443

#27. (Fixed in 1.33MR1) Comments in DLG actions

3444

3445

Previously, DLG would not recognize comments as a special case.

3446

Thus, ">>" in the comments would cause errors. This is fixed.

3447

3448

#26. (Fixed in 1.33MR1) Removed static variables from error routines

3449

3450

Previously, the existence of statically allocated variables

3451

in some of the parser's member functions posed a danger when

3452

there was more than one parser active.

3453

3454

Replaced with dynamically allocated/freed variables in 1.33MR1.

3455

3456

#25. (Fixed in 1.33MR1) Use of string literals in semantic predicates

3457

3458

Previously, it was not possible to place a string literal in

3459

a semantic predicate because it was not properly "stringized"

3460

for the report of a failed predicate.

3461

3462

#24. (Fixed in 1.33MR1) Continuation lines for semantic predicates

3463

3464

Previously, it was not possible to continue semantic

3465

predicates across a line because it was not properly

3466

"stringized" for the report of a failed predicate.

3467

3468

rule : <<ifXYZ()>>?[ a very

3469

long statement ]

3470

3471

#23. (Fixed in 1.33MR1) {...} envelope for failed semantic predicates

3472

3473

Previously, there was a code generation error for failed

3474

semantic predicates:

3475

3476

rule : <<xyz()>>?[ stmt1; stmt2; ]

3477

3478

which generated code which resembled:

3479

3480

if (! xyz()) stmt1; stmt2;

3481

3482

It now puts the statements in a {...} envelope:

3483

3484

if (! xyz()) { stmt1; stmt2; };

3485

3486

#22. (Fixed in 1.33MR1) Continuation of #token across lines using "\"

3487

3488

Previously, it was not possible to continue a #token regular

3489

expression across a line. The trailing "\" and newline caused

3490

a newline to be inserted into the regular expression by DLG.

3491

3492

Fixed in 1.33MR1.

3493

3494

#21. (Fixed in 1.33MR1) Use of ">>" (right shift operator) in DLG actions

3495

3496

It is now possible to use the C++ right shift operator ">>"

3497

in DLG actions by using the normal escapes:

3498

3499

#token "shift-right" << value=value \>\> 1;>>

3500

3501

#20. (Version 1.33/19-Jan-97) Karl Eccleson <karle@microrobotics.co.uk>

3502

P.A. Keller (P.A.Keller@bath.ac.uk)

3503

3504

There is a problem due to using exceptions with the -gh option.

3505

3506

Suggested fix now in 1.33MR1.

3507

3508

#19. (Fixed in 1.33MR1) Tom Piscotti and John Lilley

3509

3510

There were problems suppressing messages to stderr and stdout

3511

when running in a window environment because some functions

3512

which use fprintf were not virtual.

3513

3514

Suggested change now in 1.33MR1.

3515

3516

I believe all functions containing error messages (excluding those

3517

indicating internal inconsistency) have been placed in functions

3518

which are virtual.

3519

3520

#18. (Version 1.33/22-Nov-96) John Bair (jbair@iftime.com)

3521

3522

Under some combination of options a required "return _retv" is

3523

not generated.

3524

3525

Suggested fix now in 1.33MR1.

3526

3527

#17. (Version 1.33/3-Sep-96) Ron House (house@helios.usq.edu.au)

3528

3529

The routine ASTBase::preorder_action omits two "tree->"

3530

prefixes, which results in the preorder_action belonging

3531

to the wrong node being invoked.

3532

3533

Suggested fix now in 1.33MR1.

3534

3535

#16. (Version 1.33/7-Jun-96) Eli Sternheim <eli@interhdl.com>

3536

3537

Routine consumeUntilToken() does not check for end-of-file

3538

condition.

3539

3540

Suggested fix now in 1.33MR1.

3541

3542

#15. (Version 1.33/8 Apr 96) Asgeir Olafsson <olafsson@cstar.ac.com>

3543

3544

Problem with tree duplication of doubly linked ASTs in ASTBase.cpp.

3545

3546

Suggested fix now in 1.33MR1.

3547

3548

#14. (Version 1.33/28-Feb-96) Andreas.Magnusson@mailbox.swipnet.se

3549

3550

Problem with definition of operator = (const ANTLRTokenPtr rhs).

3551

3552

Suggested fix now in 1.33MR1.

3553

3554

#13. (Version 1.33/13-Feb-96) Franklin Chen (chen@adi.com)

3555

3556

Sun C++ Compiler 3.0.1 can't compile testcpp/1 due to goto in

3557

block with destructors.

3558

3559

Apparently fixed. Can't locate "goto".

3560

3561

#12. (Version 1.33/10-Nov-95) Minor problems with 1.33 code

3562

3563

The following items have been fixed in 1.33MR1:

3564

3565

1. pccts/antlr/main.c line 142

3566

3567

"void" appears in classic C code

3568

3569

2. no makefile in support/genmk

3570

3571

3. EXIT_FAILURE/_SUCCESS instead of PCCTS_EXIT_FAILURE/_SUCCESS

3572

3573

pccts/h/PCCTSAST.cpp

3574

pccts/h/DLexerBase.cpp

3575

pccts/testcpp/6/test.g

3576

3577

4. use of "signed int" isn't accepted by AT&T cfront

3578

3579

pccts/h/PCCTSAST.h line 42

3580

3581

5. in call to ANTLRParser::FAIL the var arg err_k is passed as

3582

"int" but is declared "unsigned int".

3583

3584

6. I believe that a failed validation predicate still does not

3585

get put in a "{...}" envelope, despite the release notes.

3586

3587

7. The #token ">>" appearing in the DLG grammar description

3588

causes DLG to generate the string literal "\>\>" which

3589

is non-conforming and will cause some compilers to

3590

complain (scan.c function act10 line 143 of source code).

3591

3592

#11. (Version 1.32b6) Dave Kuhlman (dkuhlman@netcom.com)

3593

3594

Problem with file close in gen.c. Already fixed in 1.33.

3595

3596

#10. (Version 1.32b6/29-Aug-95)

3597

3598

pccts/antlr/main.c contains C++ style comments on lines 149

3599

and 176 which causes problems for most C compilers.

3600

3601

Already fixed in 1.33.

3602

3603

#9. (Version 1.32b4/14-Mar-95) dlgauto.h #include "config.h"

3604

3605

The file pccts/h/dlgauto.h should probably contain a #include

3606

"config.h" as it uses the #define symbol __USE_PROTOS.

3607

3608

Added to 1.33MR1.

3609

3610

#8. (Version 1.32b4/6-Mar-95) Michael T. Richter (mtr@igs.net)

3611

3612

In C++ output mode anonymous tokens from in-line regular expressions

3613

can create enum values which are too wide for the datatype of the enum

3614

assigned by the C++ compiler.

3615

3616

Fixed in 1.33MR1.

3617

3618

#7. (Version 1.32b4/6-Mar-95) C++ does not imply __STDC__

3619

3620

In err.h the combination of # directives assumes that a C++

3621

compiler has __STDC__ defined. This is not necessarily true.

3622

3623

This problem also appears in the use of __USE_PROTOS which

3624

is appropriate for both Standard C and C++ in antlr/gen.c

3625

and antlr/lex.c

3626

3627

Fixed in 1.33MR1.

3628

3629

#6. (Version 1.32 ?/15-Feb-95) Name conflict for "TokenType"

3630

3631

Already fixed in 1.33.

3632

3633

#5. (23-Jan-95) Douglas_Cuthbertson.JTIDS@jtids_qmail.hanscom.af.mil

3634

3635

The fail action following a semantic predicate is not enclosed in

3636

"{...}". This can lead to problems when the fail action contains

3637

more than one statement.

3638

3639

Fixed in 1.33MR1.

3640

3641

#4. (Version 1.33/31-Mar-96) jlilley@empathy.com (John Lilley)

3642

3643

Put briefly, a semantic predicate ought to abort a guess if it fails.

3644

3645

Correction suggested by J. Lilley has been added to 1.33MR1.

3646

3647

#3. (Version 1.33) P.A.Keller@bath.ac.uk

3648

3649

Extra commas are placed in the K&R style argument list for rules

3650

when using both exceptions and ASTs.

3651

3652

Fixed in 1.33MR1.

3653

3654

#2. (Version 1.32b6/2-Oct-95) Brad Schick <schick@interaccess.com>

3655

3656

Construct #[] generates zzastnew() in C++ mode.

3657

3658

Already fixed in 1.33.

3659

3660

#1. (Version 1.33) Bob Bailey (robert@oakhill.sps.mot.com)

3661

3662

Previously, config.h assumed that all PC systems required

3663

"short" file names. The user can now override that

3664

assumption with "#define LONGFILENAMES".

3665

3666

Added to 1.33MR1.

Older »