1
(* $Id: sample.ml 662 2004-05-25 20:57:28Z gerd $
2
* ----------------------------------------------------------------------
7
(**********************************************************************)
8
(* Examples for event-based parsing ("SAX-like parsing") *)
9
(**********************************************************************)
20
(* dump_event: dumps a single parsing event *)
24
E_start_doc(v,sa,dtd) ->
25
printf "E_start_doc version=%s standalone=%b\n" v sa
28
| E_start_tag(name,attlist,_) ->
29
printf "E_start_tag %s %s\n" name
30
(String.concat " " (List.map (fun (n,v) -> n ^ "=" ^ v) attlist))
31
| E_end_tag(name,_) ->
32
printf "E_end_tag %s\n" name
34
printf "E_char_data %s\n" data
35
| E_pinstr(target,data) ->
36
printf "E_pinstr %s %s\n" target data
38
printf "E_comment %s\n" data
39
| E_position(ent,line,col) ->
40
printf "E_position %s line=%d col=%d\n" ent line col
42
printf "E_error %s\n" (Printexc.to_string e)
44
printf "E_end_of_stream\n"
48
(* parse: prints the events while parsing the passed string *)
54
(create_entity_manager default_config (from_string s))
60
(* curly_parse: demonstrates how to use escape_contents. The character
61
* { escapes from normal parsing and calls [escape]. The arithmetic
62
* expression inside { ... } is evaluated, and the result is taken
63
* as the text content.
64
* Try: curly_parse "<a>{123 + 5}</a>"
65
* curly_parse "<a>{{123 + 5}}</a>"
66
* curly_parse "<a>{123 + 5</a>}</a>"
69
let inc_col (l,c) = (l,c+1);;
70
let inc_line (l,c) = (l+1,0);;
71
let add_col n (l,c) = (l,c+n);;
76
(* We get now the current lexical buffer of PXP, and use it for
77
* our own parsing. In particular, we call Expr.topexpr
78
* to parse the arithmetic expression. While parsing,
79
* we track the current line and column (function [scan]).
81
let line_col = ref (mng # current_line_column) in
82
(* Note: current_line_column returns the position of the beginning of
83
* the token that has been parsed last. The last token was "{" (Lcurly).
84
* So we must add 1 to this position to get the position of the
85
* beginning of the next token.
87
line_col := inc_col !line_col;
89
match scan_expr buf with
91
line_col := inc_line !line_col;
94
let n = Lexing.lexeme_end buf - Lexing.lexeme_start buf in
95
line_col := add_col n !line_col;
98
let n = Lexing.lexeme_end buf - Lexing.lexeme_start buf in
99
line_col := add_col n !line_col;
102
let lexbuf = mng # current_lexbuf in
103
let value = topexpr scan lexbuf in
104
printf "Result of expression: %d\n" value;
105
mng # update_line_column !line_col;
109
let escape_contents tok mng =
110
(* This function is called when "{", "{{", "}", or "}}" are found in
111
* character node context.
118
(* "{{" found: map to "{" *)
122
failwith "Single brace } not allowed"
124
(* "}}" found: map to "}" *)
129
let escape_attributes tok pos mng =
130
(* This function is called when "{", "{{", "}", or "}}" are found in
142
failwith "Single brace } not allowed"
150
let config = { default_config with
151
escape_contents = Some escape_contents;
152
escape_attributes = Some escape_attributes;
157
(create_entity_manager config (from_string s))
163
(* rec_curly_parse: Here, escape_contents calls the XML parser recursively,
164
* i.e. you can write XML documents inside curly braces, like in
165
* rec_curly_parse "<A> { <B> x </B> } </A>" or
166
* rec_curly_parse "<A att='{ <B> x </B> }'>y</A>"
168
* This is currently very experimental!
171
class any_entity_id = object end ;;
172
(* An entity ID is an object without properties except identity *)
174
let rec_curly_parse s =
175
let ent_id_guard = new any_entity_id in
176
let base_config = default_config in
178
let rec escape ent_id tok mng =
179
(* ent_id: is the entity ID containing the last Lcurly, or ent_id_guard
180
* when there was none yet
182
let current_ent = mng # current_entity in
183
let current_ent_id = (current_ent :> entity_id) in
186
printf "Starting subparser...\n";
187
(* Because [current_ent] is already open, we cannot use it as
188
* top-level entity in [process_entity] (it is not possible to
189
* open an entity several times). The solution is [sub_ent],
190
* a so-called entity section that behaves much like [current_ent]
191
* and shares most of the state with [current_ent], but pretends
192
* it were an entity of its own.
194
let sub_ent = new Pxp_entity.entity_section current_ent in
195
let sub_ent_id = (sub_ent :> entity_id) in
198
escape_contents = Some (escape sub_ent_id) ;
199
escape_attributes = Some (escape_att sub_ent_id) ;
202
(* Pushing sub_ent makes it the top-level entity: *)
203
mng # push_entity sub_ent;
209
assert(mng # current_entity = sub_ent);
210
(* Pop sub_ent afterwards: *)
215
if ent_id = ent_id_guard then
216
(* A right curly brace without matching left curly brace *)
217
failwith "Single brace } not allowed"
219
if ent_id = current_ent_id then (
220
(* The normal case: *)
221
printf "Stopping subparser...\n";
222
ignore(current_ent # close_entity);
224
(* Causes that the current [process_entity] parser invocation
225
* terminates (if possible)
229
(* ent_id <> current_ent_id: This can happen if entities and
230
* braces are used in strange ways:
231
* <!DOCTYPE a [ <!ENTITY brace '}'> ]>
232
* <a> { <b>xxx</b> &brace; </a>
234
failwith "Bad nesting of entities and braces {...}"
238
and escape_att ent_id tok pos mng = escape ent_id tok mng
242
escape_contents = Some (escape ent_id_guard);
243
escape_attributes = Some (escape_att ent_id_guard);
249
(create_entity_manager config (from_string s))
255
(* parse_expr: An example for process_expr that parses the expressions
256
* found in a string one after another.
258
* parse_expr "<?pi?><abc>def</abc> <qrt>def</qrt> "
260
* Unfortunately, we need the undocumented methods [open_entity] and
261
* [close_entity] from [Pxp_entity], and we need the knowledge that
262
* a [Begin_entity] token can be found at the beginning of the entity,
263
* and that an [End_entity] token signals the end of the entity. Using
264
* [process_expr] is quite low-level.
269
{ default_config with
270
enable_pinstr_nodes = true;
271
enable_comment_nodes = true
273
let m = create_entity_manager config (from_string s) in
274
m # current_entity # open_entity true Pxp_lexer_types.Content;
275
let begin_entity_token = !(m # yy_get_next_ref)() in
276
assert (begin_entity_token = Begin_entity);
277
let tok = ref Ignore in (* Ignore does not occur in the token stream *)
278
while !tok <> End_entity do
280
if !tok <> Ignore then Some !tok else None in
281
printf "*** Calling process_expr...\n";
282
process_expr ?first_token ~following_token:tok config m dump_event;
284
ignore(m # current_entity # close_entity);