1
(**************************************************************************)
5
(* Fran�ois Pottier and Yann R�gis-Gianas, INRIA Rocquencourt *)
7
(* Copyright 2005 Institut National de Recherche en Informatique et *)
8
(* en Automatique. All rights reserved. This file is distributed *)
9
(* under the terms of the Q Public License version 1.0, with the *)
10
(* change described in file LICENSE. *)
12
(**************************************************************************)
16
(* ------------------------------------------------------------------------ *)
17
(* Symbolic lookahead information. *)
19
(* A symbolic lookahead set consists of an actual concrete set of
20
terminal symbols and of a number of set variables. Set variables as
21
encoded as integers. *)
23
module SymbolicLookahead = struct
26
TerminalSet.t * CompressedBitSet.t
29
(toks, CompressedBitSet.empty)
32
constant TerminalSet.empty
34
let union (toks1, vars1) ((toks2, vars2) as s2) =
35
let toks = TerminalSet.union toks1 toks2
36
and vars = CompressedBitSet.union vars1 vars2 in
37
if toks2 == toks && vars2 == vars then
42
let variable (var : int) : t =
43
(TerminalSet.empty, CompressedBitSet.singleton var)
45
let project (toks, vars) =
46
assert (CompressedBitSet.is_empty vars);
51
(* We will perform closure operations over symbolic lookahead sets.
52
This allows us to later represent LR(1) states as pairs of an
53
LR(0) node number and an array of concrete lookahead sets. *)
55
module SymbolicClosure =
56
Item.Closure(SymbolicLookahead)
58
(* Closure operations over concrete lookahead sets are also used (when
59
explaining conflicts). One could take another instance of the
60
functor. The approach below is somewhat less elegant and makes each
61
call to [closure] somewhat slower, but saves the cost of
62
instantiating the functor again -- which is linear in the size of
65
type concretelr1state =
66
TerminalSet.t Item.Map.t
68
let closure (state : concretelr1state) : concretelr1state =
69
Item.Map.map SymbolicLookahead.project
70
(SymbolicClosure.closure
71
(Item.Map.map SymbolicLookahead.constant state))
73
(* ------------------------------------------------------------------------ *)
74
(* Finding which non-epsilon transitions leave a set of items. This
75
code is parametric in the nature of lookahead sets. *)
77
let transitions (state : 'a Item.Map.t) : 'a Item.Map.t SymbolMap.t =
79
Item.Map.fold (fun item toks transitions ->
80
match Item.classify item with
81
| Item.Shift (symbol, item') ->
82
let items : 'a Item.Map.t =
84
SymbolMap.find symbol transitions
88
SymbolMap.add symbol (Item.Map.add item' toks items) transitions
91
) state SymbolMap.empty
93
(* ------------------------------------------------------------------------ *)
94
(* Determining the reduction opportunities at a (closed) state. They
95
are represented as a list of pairs of a lookahead set and a
96
production index. This code is again parametric in the nature of
99
let reductions (state : 'a Item.Map.t) : ('a * Production.index) list =
100
Item.Map.fold (fun item toks accu ->
101
match Item.classify item with
102
| Item.Reduce prod ->
108
(* ------------------------------------------------------------------------ *)
109
(* Construction of the the LR(0) automaton. *)
111
(* Nodes are numbered sequentially. *)
116
(* A symbolic transition is a pair of the target state number and an
117
array of symbolic lookahead sets. The variables in these sets are
118
numbered in [0,g) where g is the number of items in the source
119
LR(0) state. Items are numbered in the order of presentation by
122
type symbolic_transition_target =
123
node * SymbolicLookahead.t array
125
(* The automaton is represented by (growing) arrays of states (sets of
126
items), symbolic transition information, and symbolic reduction
127
information, indexed by node numbers. Conversely, a hash table maps
128
states (sets of items) to node numbers. *)
133
let states : Item.Set.t InfiniteArray.t =
134
InfiniteArray.make Item.Set.empty
136
let _transitions : symbolic_transition_target SymbolMap.t InfiniteArray.t =
137
InfiniteArray.make SymbolMap.empty
139
let _reductions : (SymbolicLookahead.t * Production.index) list InfiniteArray.t =
140
InfiniteArray.make []
142
let map : (Item.Set.t, node) Hashtbl.t =
145
(* The automaton is built depth-first. *)
147
let rec explore (state : Item.Set.t) : node =
149
(* Find out whether this state was already explored. *)
152
Hashtbl.find map state
155
(* If not, create a new node. *)
159
InfiniteArray.set states k state;
160
Hashtbl.add map state k;
162
(* Build a symbolic version of the current state, where each item
163
is associated with a distinct lookahead set variable, numbered
166
let (_ : int), (symbolic_state : SymbolicClosure.state) =
167
Item.Set.fold (fun item (i, symbolic_state) ->
168
i+1, Item.Map.add item (SymbolicLookahead.variable i) symbolic_state
169
) state (0, Item.Map.empty) in
171
(* Compute the symbolic closure. *)
173
let closure = SymbolicClosure.closure symbolic_state in
175
(* Compute symbolic information about reductions. *)
177
InfiniteArray.set _reductions k (reductions closure);
179
(* Compute symbolic information about the transitions, and, by
180
dropping the symbolic lookahead information, explore the
181
transitions to further LR(0) states. *)
183
InfiniteArray.set _transitions k (SymbolMap.map (fun symbolic_state ->
184
let (k : node) = explore (Item.Map.domain symbolic_state) in
185
let lookahead : SymbolicLookahead.t array =
186
Array.create (Item.Map.cardinal symbolic_state) SymbolicLookahead.empty in
187
let (_ : int) = Item.Map.fold (fun _ s i ->
190
) symbolic_state 0 in
191
((k, lookahead) : symbolic_transition_target)
192
) (transitions closure));
196
(* Creating a start state out of a start production. It contains a
197
single item, consisting of the start production, at position 0. *)
199
let start prod : Item.Set.t =
200
Item.Set.singleton (Item.import (prod, 0))
202
(* This starts the construction of the automaton and records the
203
entry nodes in an array. *)
205
let entry : node array =
206
Production.maps (fun prod ->
217
Error.logA 1 (fun f -> Printf.fprintf f "Built an LR(0) automaton with %d states.\n" n);
218
Time.tick "Construction of the LR(0) automaton"
220
(* ------------------------------------------------------------------------ *)
223
let items node : Item.Set.t =
224
InfiniteArray.get states node
226
(* ------------------------------------------------------------------------ *)
227
(* Help for building the LR(1) automaton. *)
229
(* An LR(1) state is represented as a pair of an LR(0) state number
230
and an array of concrete lookahead sets (whose length depends on
234
node * TerminalSet.t array
236
(* An encoded LR(1) state can be turned into a concrete representation,
237
that is, a mapping of items to concrete lookahead sets. *)
239
let export (k, toksr) =
240
let (_ : int), items = Item.Set.fold (fun item (i, items) ->
241
i+1, Item.Map.add item toksr.(i) items
242
) (InfiniteArray.get states k) (0, Item.Map.empty) in
245
(* Displaying a concrete state. *)
247
let print_concrete (state : concretelr1state) =
248
let buffer = Buffer.create 1024 in
249
Item.Map.iter (fun item toks ->
250
Printf.bprintf buffer "%s[ %s ]\n" (Item.print item) (TerminalSet.print toks)
252
Buffer.contents buffer
254
(* Displaying a state. By default, only the kernel is displayed, not
258
print_concrete (export state)
260
let print_closure state =
261
print_concrete (closure (export state))
263
(* The core of an LR(1) state is the underlying LR(0) state. *)
268
(* A sanity check. *)
270
let well_formed (k, toksr) =
271
Array.length toksr = Item.Set.cardinal (InfiniteArray.get states k)
273
(* An LR(1) start state is the combination of an LR(0) start state
274
(which consists of a single item) with a singleton lookahead set
275
that consists of the end-of-file pseudo-token. *)
278
let state = (k, [| TerminalSet.singleton Terminal.sharp |]) in
279
assert (well_formed state);
282
(* Interpreting a symbolic lookahead set with respect to a source
283
state. The variables in the symbolic lookahead set (which are
284
integers) are interpreted as indices into the state's array of
285
concrete lookahead sets. The result is a concrete lookahead set. *)
288
((_, toksr) as state : lr1state)
289
((toks, vars) : SymbolicLookahead.t)
292
assert (well_formed state);
293
CompressedBitSet.fold (fun var toks ->
294
assert (var >= 0 && var < Array.length toksr);
295
TerminalSet.union toksr.(var) toks
298
(* Out of an LR(1) state, one produces information about reductions
299
and transitions. This is done in an efficient way by interpreting
300
the precomputed symbolic information with respect to that state. *)
303
((k, _) as state : lr1state)
304
: (TerminalSet.t * Production.index) list =
306
List.map (fun (s, prod) ->
307
interpret state s, prod
308
) (InfiniteArray.get _reductions k)
311
((k, _) as state : lr1state)
312
: lr1state SymbolMap.t =
314
SymbolMap.map (fun ((k, sr) : symbolic_transition_target) ->
315
((k, Array.map (interpret state) sr) : lr1state)
316
) (InfiniteArray.get _transitions k)
322
SymbolMap.domain (InfiniteArray.get _transitions k)
326
((k, _) as state : lr1state)
329
let ((k, sr) : symbolic_transition_target) =
331
SymbolMap.find symbol (InfiniteArray.get _transitions k)
333
assert false (* no transition along this symbol *)
335
(k, Array.map (interpret state) sr)
337
(* Equality of states. *)
339
let equal ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
340
assert (k1 = k2 && well_formed state1 && well_formed state2);
346
(TerminalSet.equal toksr1.(i) toksr2.(i)) && (loop i)
348
loop (Array.length toksr1)
350
(* Subsumption between states. *)
352
let subsume ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
353
assert (k1 = k2 && well_formed state1 && well_formed state2);
359
(TerminalSet.subset toksr1.(i) toksr2.(i)) && (loop i)
361
loop (Array.length toksr1)
363
(* This function determines whether two (core-equivalent) states are
364
compatible, according to a criterion that is close to Pager's weak
365
compatibility criterion.
367
Pager's criterion guarantees that if a merged state has a potential
368
conflict at [(i, j)] -- that is, some token [t] appears within the
369
lookahead sets of both item [i] and item [j] -- then there exists a
370
state in the canonical automaton that also has a potential conflict
371
at [(i, j)] -- that is, some token [u] appears within the lookahead
372
sets of both item [i] and item [j]. Note that [t] and [u] can be
375
Pager has shown that his weak compatibility criterion is stable,
376
that is, preserved by transitions and closure. This means that, if
377
two states can be merged, then so can their successors. This is
378
important, because merging two states means committing to merging
379
their successors, even though we have not even built these
382
The criterion used here is a slightly more restrictive version of
383
Pager's criterion, which guarantees equality of the tokens [t] and
384
[u]. This is done essentially by applying Pager's original
385
criterion on a token-wise basis. Pager's original criterion states
386
that two states can be merged if the new state has no conflict or
387
one of the original states has a conflict. Our more restrictive
388
criterion states that two states can be merged if, for every token
389
[t], the new state has no conflict at [t] or one of the original
390
states has a conflict at [t].
392
This modified criterion is also stable. My experiments show that it
393
is almost as effective in practice: out of more than a hundred
394
real-world sample grammars, only one automaton was affected, and
395
only one extra state appeared as a result of using the modified
396
criterion. Its advantage is to potentially make conflict
397
explanations easier: if there appears to be a conflict at [t], then
398
some conflict at [t] can be explained. This was not true when using
399
Pager's original criterion. *)
401
let compatible (k1, toksr1) (k2, toksr2) =
403
let n = Array.length toksr1 in
404
(* Two states are compatible if and only if they are compatible
405
at every pair (i, j), where i and j are distinct. *)
410
let toksr1i = toksr1.(i)
411
and toksr2i = toksr2.(i) in
416
let toksr1j = toksr1.(j)
417
and toksr2j = toksr2.(j) in
419
(* The two states are compatible at (i, j) if every conflict
420
token in the merged state already was a conflict token in
421
one of the two original states. This could be written as
425
(TerminalSet.inter (TerminalSet.union toksr1i toksr2i) (TerminalSet.union toksr1j toksr2j))
426
(TerminalSet.union (TerminalSet.inter toksr1i toksr1j) (TerminalSet.inter toksr2i toksr2j))
428
but is easily seen (on paper) to be equivalent to:
433
(TerminalSet.inter toksr2i toksr1j)
434
(TerminalSet.union toksr1i toksr2j)
437
(TerminalSet.inter toksr1i toksr2j)
438
(TerminalSet.union toksr2i toksr1j)
442
loopj 0 && loopi (i+1)
446
(* This function determines whether two (core-equivalent) states can
447
be merged without creating an end-of-stream conflict, now or in the
450
The rule is, if an item appears in one state with the singleton "#"
451
as its lookahead set, then its lookahead set in the other state
454
So, either the second lookahead set is also the singleton "#", and
455
no end-of-stream conflict exists, or it is larger, and the second
456
state already contains an end-of-stream conflict.
458
Put another way, we do not to merge two lookahead sets when one
459
contains "#" alone and the other does not contain "#".
461
I invented this rule to complement Pager's criterion. I believe,
462
but I am not 100% sure, that it does indeed prevent end-of-stream
463
conflicts and that it is stable.
465
Thanks to S�bastien Hinderer for reporting the bug caused by the
466
absence of this extra criterion. *)
468
let eos_compatible (k1, toksr1) (k2, toksr2) =
470
let n = Array.length toksr1 in
475
let toks1 = toksr1.(i)
476
and toks2 = toksr2.(i) in
478
if TerminalSet.mem Terminal.sharp toks1 && TerminalSet.cardinal toks1 = 1 then
479
(* "#" is alone in one set: it must be a member of the other set. *)
480
TerminalSet.mem Terminal.sharp toks2
481
else if TerminalSet.mem Terminal.sharp toks2 && TerminalSet.cardinal toks2 = 1 then
482
(* Symmetric condition. *)
483
TerminalSet.mem Terminal.sharp toks1
491
(* Union of two states. The two states must have the same core. The
492
new state is obtained by pointwise union of the lookahead sets. *)
494
let union (k1, toksr1) (k2, toksr2) =
496
k1, Array.init (Array.length toksr1) (fun i ->
497
TerminalSet.union toksr1.(i) toksr2.(i)
500
(* Restriction of a state to a set of tokens of interest. Every
501
lookahead set is intersected with that set. *)
503
let restrict toks (k, toksr) =
504
k, Array.map (fun toksri ->
505
TerminalSet.inter toksri toks