~ubuntu-branches/ubuntu/feisty/menhir/feisty

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
(**************************************************************************)
(*                                                                        *)
(*  Menhir                                                                *)
(*                                                                        *)
(*  François Pottier and Yann Régis-Gianas, INRIA Rocquencourt            *)
(*                                                                        *)
(*  Copyright 2005 Institut National de Recherche en Informatique et      *)
(*  en Automatique. All rights reserved. This file is distributed         *)
(*  under the terms of the Q Public License version 1.0, with the         *)
(*  change described in file LICENSE.                                     *)
(*                                                                        *)
(**************************************************************************)

(* This module discovers information about the shape of the stack
   in each of the automaton's states.

   It would probably be possible to predict this information, instead
   of discovering it, but it would then be less obvious that it is
   correct. In this approach, it is more obvious that the discovered
   invariant is correct. The drawback is that it is not necessarily
   clear why it is strong enough to guarantee that the generated code
   is well-typed. *)

open Grammar

(* ------------------------------------------------------------------------ *)
(* Sets of nodes. *)

module NodeSet =
  Set.Make (struct
    type t = Lr1.node
    let compare node1 node2 = Lr1.number node1 - Lr1.number node2
  end)

(* ------------------------------------------------------------------------ *)
(* Discover what is known of the structure of the stack in every
   automaton node. This knowledge can be defined, to a first
   approximation, as the greatest common suffix of all paths that lead
   up to this node, where a path is viewed as a string of symbols.

   At each state, we distinguish two cases: either the stack is known
   entirely, or only a suffix of it is known. In other words, the part
   of the stack that is not described by the string of symbols is
   empty in the former case and unknown in the latter.

   We compute this information by beginning with an upper bound,
   namely the shortest path towards each node. The string that we are
   looking for is a substring of that path. Furthermore, we enforce
   the property that, when two paths join, they should agree about the
   structure of the stack at the join point. That is, whenever we find
   a new edge that leads to a known state, we take the greatest common
   suffix of the two strings that they correspond to.

   In addition to this information, we compute the set of states that
   are liable to appear in each stack cell. When two paths join, we
   take the union of the state sets that appear along the greatest
   common suffix. *)

type tail =
  | TailEmpty
  | TailUnknown

type cell =
    Symbol.t * NodeSet.t

type word =
    cell list

type stack =
    word * tail

(* ------------------------------------------------------------------------ *)
(* This is how we extend a stack with a new symbol. *)

let extend cell (w, tail) =
  cell :: w, tail

(* ------------------------------------------------------------------------ *)
(* This is how we compute the meet of two stacks. *)

let rec tmeet (tail1 : tail) (tail2 : tail) : tail =
  match tail1, tail2 with
  | TailEmpty, TailEmpty ->
      TailEmpty
  | TailUnknown, _
  | _, TailUnknown ->
      TailUnknown

let rec wmeet (w1 : word) (w2 : word) : word * tail =
  match w1, w2 with
  | [], [] ->
      (* Both stacks are empty. There is agreement. *)
      [], TailEmpty
  | [], _ :: _
  | _ :: _, [] ->
      (* One stack is empty, but the other isn't. The greatest
	 common suffix is empty, and there is disagreement. *)
      [], TailUnknown
  | (symbol1, states1) :: w1, (symbol2, states2) :: w2 ->
      if Symbol.equal symbol1 symbol2 then
	(* The stacks agree on their top cell. It is therefore part
	   of the greatest common suffix. *)
        let w, tail = wmeet w1 w2 in
	(symbol1, NodeSet.union states1 states2) :: w, tail
      else
        (* The stacks disagree on their top cell. Their greatest common
	   suffix is therefore empty. *)
	[], TailUnknown

let rec meet (stk1 : stack) (stk2 : stack) : stack =
  let w1, tail1 = stk1
  and w2, tail2 = stk2 in
  let w, tail = wmeet w1 w2 in
  w, tmeet tail (tmeet tail1 tail2)

(* ------------------------------------------------------------------------ *)
(* This is how we truncate a stack at a certain depth. *)

let truncate depth (w, tail) =
  assert (List.length w >= depth);
  Misc.truncate depth w, TailUnknown

(* ------------------------------------------------------------------------ *)
(* This is our initial estimation of the stack at every node. It will remain
   unmodified at entry nodes, but will be overwritten without consideration
   at all other nodes. *)

let empty : stack =
  [], TailEmpty

(* ------------------------------------------------------------------------ *)
(* This is a mapping of automaton nodes to stacks. *)

let stacks : stack array =
  Array.make Lr1.n empty

let stack node =
  stacks.(Lr1.number node)

let set_stack node h =
  stacks.(Lr1.number node) <- h

(* ------------------------------------------------------------------------ *)
(* Here comes the initial discovery phase. *)

let () =
  Lr1.bfs (fun discovery source symbol target ->

    (* There is a transition from [source] to [target] labeled
       [symbol]. This yields a new description of the stack at the
       target node, where the top cell holds [symbol] and [source],
       and where the structure of the remainder of the stack is the
       structure of the stack at the [source] node. *)

    let stk = extend (symbol, NodeSet.singleton source) (stack source) in

    (* Define or update the structure of the stack at the [target]
       node, depending on whether this node was newly discovered. *)

    set_stack target (if discovery then stk else meet (stack target) stk)

  )

(* ------------------------------------------------------------------------ *)
(* We now discover what can be said of the structure of the stack when
   production [prod] is about to be reduced. At the same time, we
   count how many states can reduce each production and warn about
   productions that are never reduced. *)

type info =
  (* Production is never reduced. *)
  | Zero
  (* Production can be reduced at certain nodes with a certain stack structure. *)
  | More of NodeSet.t * stack

type prodinfo =
    info ProductionMap.t

let find prod prodinfo =
  try
    ProductionMap.lookup prod prodinfo
  with Not_found ->
    Zero

let prodinfo : prodinfo =
  Lr1.fold (fun prodinfo node ->
    TerminalMap.fold (fun _ prods prodinfo ->
      let prod = Misc.single prods in
      ProductionMap.add prod (
	match find prod prodinfo with
	| Zero ->
	    More (
	      NodeSet.singleton node,
	      truncate (Production.length prod) (stack node)
            )
	| More (nodes, stk') ->
	    More (
	      NodeSet.add node nodes,
	      meet (stack node) stk'
            )
      ) prodinfo
    ) (Lr1.reductions node) prodinfo
  ) ProductionMap.empty

let () =
  let count = ref 0 in
  Production.iter (fun prod ->
    match find prod prodinfo, Production.classify prod with
    | Zero, Some nt ->
	incr count;
	Error.warningN
	  (Nonterminal.positions nt)
	  (Printf.sprintf "symbol %s is never accepted." (Nonterminal.print false nt))
    | Zero, None ->
	incr count;
	Error.warningN
	  (Production.positions prod)
	  (Printf.sprintf "production %sis never reduced." (Production.print prod))
    | More (_, (w, _)), _ ->
	assert (List.length w = Production.length prod)
  );
  if !count > 0 then
    Error.warning
      (Printf.sprintf "in total, %d productions are never reduced." !count)

let prodstack prod =
  match find prod prodinfo with
  | Zero ->
      assert false
  | More (_, stk) ->
      stk

(* ------------------------------------------------------------------------ *)
(* We now determine which states must be represented, that is,
   explicitly pushed onto the stack. For simplicity, a state is either
   always represented or never represented. More fine-grained
   strategies, where a single state is sometimes pushed onto the stack
   and sometimes not pushed, depending on which outgoing transition is
   being taken, are conceivable, but quite tricky, and probably not
   worth the trouble.

   (1) If two states are liable to appear within a single stack cell,
   then one is represented if and only if the other is
   represented. This ensures that the structure of stacks is known
   everywhere and that we can propose types for stacks.

   (2) If a state [s] has an outgoing transition along nonterminal
   symbol [nt], and if the [goto] table for symbol [nt] has more than
   one target, then state [s] is represented.

   (3) If a stack cell contains more than one state and if at least
   one if these states is able to handle the [error] token, then these
   states are represented.

   (4) If the semantic action associated with a production mentions
   the [$syntaxerror] keyword, then the state that is being reduced to
   (that is, the state that initiated the recognition of this
   production) is represented. (Indeed, it will be passed as an
   argument to [errorcase].) *)

(* Data. *)

let rep : bool UnionFind.point array =
  Array.init Lr1.n (fun _ -> UnionFind.fresh false)

(* Getter. *)

let represented state =
  rep.(Lr1.number state)

(* Setters. *)

let represent state =
  UnionFind.change (represented state) true

let represents states =
  represent (NodeSet.choose states)

(* Enforce condition (1) above. *)

let share (w, _) =
  List.iter (fun (_, states) ->
    let dummy = UnionFind.fresh false in
    NodeSet.iter (fun state ->
      UnionFind.eunion dummy (represented state)
    ) states
  ) w

let () =
  Array.iter share stacks;
  Production.iter (fun prod ->
    match find prod prodinfo with
    | Zero ->
	()
    | More (_, stk) ->
	share stk
  )

(* Enforce condition (2) above. *)

let () =
  Nonterminal.iter (fun nt ->
    let count = 
      Lr1.targets (fun count _ _ ->
	count + 1
      ) 0 (Symbol.N nt)
    in
    if count > 1 then
      Lr1.targets (fun () sources _ ->
	List.iter represent sources
      ) () (Symbol.N nt)
  )

(* Enforce condition (3) above. *)

let handler state =
  try
    let _ = SymbolMap.find (Symbol.T Terminal.error) (Lr1.transitions state) in
    true
  with Not_found ->
    try
      let _ = TerminalMap.lookup Terminal.error (Lr1.reductions state) in
      true
    with Not_found ->
      false

let handlers states =
  NodeSet.exists handler states

let () =
  Array.iter (fun (w, _) ->
    List.iter (fun (_, states) ->
      if NodeSet.cardinal states >= 2 && handlers states then
	represents states
    ) w
  ) stacks

(* Enforce condition (4) above. *)

let () =
  Production.iterx (fun prod ->
    if Action.has_syntaxerror (Production.action prod) then
      match find prod prodinfo with
      | Zero ->
	  ()
      | More (sites, (w, _)) ->
	  let length = Production.length prod in
	  if length = 0 then
	    NodeSet.iter represent sites
	  else
	    let (_, states) = List.nth w (length - 1) in
	    represents states
  )

(* Define accessors. *)

let represented state =
  UnionFind.find (represented state)

let representeds states =
  if NodeSet.is_empty states then
    assert false
  else
    represented (NodeSet.choose states)

let representedc (_, states) =
  representeds states

let handlerc (_, states) =
  handlers states

let fold f accu w =
  List.fold_right (fun (symbol, states) accu ->
    f accu (representeds states) symbol
  ) w accu

let fold_top f accu w =
  match w with
  | [] ->
      accu
  | (symbol, states) :: _ ->
      f (representeds states) symbol

let () =
  Error.logC 1 (fun f ->
    let count =
      Lr1.fold (fun count node ->
        if represented node then count + 1 else count
      ) 0
    in
    Printf.fprintf f "%d out of %d states are represented.\n" count Lr1.n
  )

(* ------------------------------------------------------------------------ *)
(* Explain how the stack should be deconstructed when an error is
   found.

   We sometimes have a choice as too how many stack cells should be
   popped. Indeed, several cells in the known suffix of the stack may
   physically hold a state. If neither of these states handles errors,
   then we could jump to either. (Indeed, if we jump to one that's
   nearer, it will in turn pop further stack cells and jump to one
   that's farther.) In the interests of code size, we should pop as
   few stack cells as possible. So, we jump to the topmost represented
   state in the known suffix. *)

type state =
  | Represented
  | UnRepresented of Lr1.node

type instruction =
  | Die
  | DownTo of word * state

let rewind node : instruction =
  let w, tail = stack node in

  let rec rewind w =
    match w, tail with
    | [], TailEmpty ->
	Die
    | [], TailUnknown ->

	(* I believe that every stack description either is definite
	   (that is, ends with [TailEmpty]) or contains at least one
	   represented state. This property, if true, ensures that
	   this assertion cannot fail. *)

        (* TEMPORARY prove this property. If the property is not true in
           general, one could make it true by making more states
           represented. *)

	assert false

    | cell :: w, _ ->

	if representedc cell then

	  (* Here is a represented state. We will pop this
	     cell and no more. *)

	  DownTo ([ cell ], Represented)

	else if handlerc cell then

	  (* Here is an unrepresented state that can handle
	     errors. The cell must hold a singleton set of states, so
	     we know which state to jump to, even though it isn't
	     represented. *)

	  let (_, states) = cell in
	  assert (NodeSet.cardinal states = 1);
	  let state = NodeSet.choose states in
	  DownTo ([ cell ], UnRepresented state)

	else

	  (* Here an unrepresented state that does not handle
	     errors. Pop this cell and look further. *)

	  match rewind w with
	  | Die ->
	      Die
	  | DownTo (w, st) ->
	      DownTo (cell :: w, st)

  in
  rewind w

(* ------------------------------------------------------------------------ *)
(* Accessors for information about the stack. *)

let stack node : word =
  let (w, _) = stack node in
  w

let prodstack prod : word =
  let (w, _) = prodstack prod in
  w

let gotostack : Nonterminal.t -> word =
  Nonterminal.tabulate (fun nt ->
    let sources =
      Lr1.targets (fun accu sources _ ->
	List.fold_right NodeSet.add sources accu
      ) NodeSet.empty (Symbol.N nt)
    in
    [ Symbol.N nt, sources ]
  )

(* ------------------------------------------------------------------------ *)
(* We now determine which positions must be kept track of. For
   simplicity, we do this on a per symbol basis. That is, for each
   symbol, either we never keep track of position information, or we
   always do. In fact, we do distinguish start and end positions.
   This leads to computing two sets of symbols -- those that keep
   track of their start position and those that keep track of their
   end position.

   A symbol on the right-hand side of a production must keep track of
   its (start or end) position if that position is explicitly
   requested by a semantic action.

   Furthermore, if the left-hand symbol of a production must keep
   track of its start (resp. end) position, then the first
   (resp. last) symbol of its right-hand side (if there is one) must
   do so as well. That is, unless the right-hand side is empty. *)

open Keyword

let startp =
  ref SymbolSet.empty

let endp =
  ref SymbolSet.empty

let rec require where symbol =
  let wherep =
    match where with
    | WhereStart ->
	startp
    | WhereEnd ->
	endp
  in
  if not (SymbolSet.mem symbol !wherep) then begin
    wherep := SymbolSet.add symbol !wherep;
    match symbol with
    | Symbol.T _ ->
	()
    | Symbol.N nt ->
	Production.iternt nt (require_aux where)
  end

and require_aux where prod =
  let nt, rhs = Production.def prod in
  let length = Array.length rhs in
  if length > 0 then
    match where with
    | WhereStart ->
	require where rhs.(0)
    | WhereEnd ->
	require where rhs.(length - 1)

let () =
  Production.iterx (fun prod ->
    let rhs = Production.rhs prod
    and ids = Production.identifiers prod
    and action = Production.action prod in

    KeywordSet.iter (function
      | Dollar _
      | PreviousError
      | SyntaxError ->
	  ()
      | Position (Left, where, _) ->
	  require_aux where prod
      | Position (RightDollar i, where, _) ->
	  require where rhs.(i - 1)
      | Position (RightNamed id, where, _) ->
	  Array.iteri (fun i id' ->
	    if id = id' then
	      require where rhs.(i)
	  ) ids
    ) (Action.keywords action)
  )

let startp =
  !startp

let endp =
  !endp

let () =
  Error.logC 1 (fun f ->
    Printf.fprintf f
      "%d out of %d symbols keep track of their start position.\n\
       %d out of %d symbols keep track of their end position.\n"
        (SymbolSet.cardinal startp) (Terminal.n + Nonterminal.n)
        (SymbolSet.cardinal endp) (Terminal.n + Nonterminal.n))

let startp symbol =
  SymbolSet.mem symbol startp

let endp symbol =
  SymbolSet.mem symbol endp

(* ------------------------------------------------------------------------ *)
(* Information about which productions are reduced and where. *)

let ever_reduced prod =
   match find prod prodinfo with
   | Zero ->
       false
   | More _ ->
       true

let fold_reduced f prod accu =
  match find prod prodinfo with
  | Zero ->
      accu
  | More (nodes, _) ->
      NodeSet.fold f nodes accu

(* ------------------------------------------------------------------------- *)
(* Miscellaneous. *)

let universal symbol =
  Lr1.fold (fun universal s ->
    universal && (if represented s then SymbolMap.mem symbol (Lr1.transitions s) else true)
  ) true

(* ------------------------------------------------------------------------ *)
(* Discover which states potentially can do error recovery.

   They are the states whose incoming symbol is [error]. At these
   states, [env.shifted] is zero, that is, no tokens have been
   successfully shifted since the last error token was shifted.

   We do not include in this definition the states where [env.shifted]
   *may be* zero. That would involve adding in all states reachable
   from the above states via reductions. However, error recovery will
   never be performed in these states. Indeed, imagine we shift an
   error token and enter a state that can do error recovery, according
   to the above definition. If, at this point, we consult the
   lookahead token [tok] and perform a reduction, then the new state
   that we reach is, by construction, able to act upon [tok], so no
   error recovery will be performed at that state, even though
   [env.shifted] is still zero. However, we must not perform default
   reductions at states that can do error recovery, otherwise we break
   this reasoning.

   If the option [--error-recovery] was not provided on the command
   line, then no states will perform error recovery. This makes things
   simpler (and saves some code) in the common case where people are
   not interested in error recovery. This also disables the warning
   about states that can do error recovery but do not accept the EOF
   token. *)

let recoverers =
  if Settings.recovery then
    Lr1.fold (fun recoverers node ->
      match Lr1.incoming_symbol node with
      | Some (Symbol.T tok)
	when Terminal.equal tok Terminal.error ->
	  NodeSet.add node recoverers
      | _ ->
	  recoverers
    ) NodeSet.empty
  else
    NodeSet.empty

let recoverer node =
  NodeSet.mem node recoverers

(* ------------------------------------------------------------------------ *)
(* Discover which states can peek at an error. These are the states
   where [env.shifted] may be -1, that is, where an error token may be
   on the stream. These are all states that are targets of a reduce
   action on [error]. *)

let errorpeekers =
  Lr1.fold (fun errorpeekers node ->
    try
      let prods = TerminalMap.lookup Terminal.error (Lr1.reductions node) in
      let prod = Misc.single prods in
      let nt = Production.nt prod in
      Lr1.targets (fun errorpeekers _ target ->
	NodeSet.add target errorpeekers
      ) errorpeekers (Symbol.N nt)
    with Not_found ->
      errorpeekers
  ) NodeSet.empty

let errorpeeker node =
  NodeSet.mem node errorpeekers

(* ------------------------------------------------------------------------ *)

let () =
  Time.tick "Constructing the invariant"