1062
1199
if not is_open then
1063
1200
failwith ("Internal entity " ^ v.name ^ " not open");
1064
1201
is_open <- false;
1202
super # close_entity
1205
(* Returns the current value of the mutable [is_open] flag of this entity. *)
method is_open = is_open
1068
1208
(* Returns the pair made of the values [replacement_text] and
 * [contains_external_references] that are in scope here.
 * Raises [Validation_error] while the entity is open; the error message
 * reports this situation as a recursive reference to the entity.
 *)
method replacement_text =
  if is_open then begin
    let msg = "Recursive reference to entity `" ^ v.name ^ "'" in
    raise (Validation_error msg)
  end;
  (replacement_text, contains_external_references)
1075
(**********************************************************************)
1077
(* An 'entity_manager' is a stack of entities, where the topmost entity
1078
* is the currently active entity, the second entity is the entity that
1079
* referred to the active entity, and so on.
1081
* The entity_manager can communicate with the currently active entity.
1083
* The entity_manager provides an interface for the parser; the functions
1084
* returning the current token and the next token are exported.
1087
class entity_manager (init_entity : entity) =
1089
val mutable entity_stack = Stack.create()
1090
val mutable current_entity = init_entity
1091
val mutable current_entity's_full_name = lazy (init_entity # full_name)
1093
val mutable yy_get_next_ref = ref (fun () -> assert false)
1096
init_entity # set_manager (self :>
1097
< current_entity : entity;
1099
push_entity : entity -> unit >
1101
yy_get_next_ref := (fun () -> init_entity # next_token)
1103
method push_entity e =
1104
e # set_manager (self :>
1105
< current_entity : entity;
1107
push_entity : entity -> unit >
1109
Stack.push (current_entity, current_entity's_full_name) entity_stack;
1110
current_entity <- e;
1111
current_entity's_full_name <- lazy (e # full_name);
1112
yy_get_next_ref := (fun () -> e # next_token);
1115
(* May raise Stack.Empty *)
1116
let e, e_name = Stack.pop entity_stack in
1117
current_entity <- e;
1118
current_entity's_full_name <- e_name;
1119
yy_get_next_ref := (fun () -> e # next_token);
1123
method position_string =
1124
(* Gets a string describing the position of the last token;
1125
* includes an entity backtrace
1127
let b = Buffer.create 200 in
1129
("In entity " ^ current_entity # full_name
1130
^ ", at line " ^ string_of_int (current_entity # line)
1131
^ ", position " ^ string_of_int (current_entity # column)
1136
("Called from entity " ^ Lazy.force e_name
1137
^ ", line " ^ string_of_int (e # line)
1138
^ ", position " ^ string_of_int (e # column)
1146
(* Returns the triple (full_name, line, column) of the last token *)
1147
Lazy.force current_entity's_full_name,
1148
current_entity # line,
1149
current_entity # column
1152
method current_entity_counts_as_external =
1153
(* Whether the current entity counts as external to the main
1154
* document for the purpose of stand-alone checks.
1156
(* TODO: improve performance *)
1157
let is_external = ref false in
1159
if e # counts_as_external then begin
1160
is_external := true;
1163
check (current_entity,());
1164
Stack.iter check entity_stack;
1168
(* Returns the entity stored in [current_entity], i.e. the entity that was
 * passed at construction time or installed by the latest push/pop.
 *)
method current_entity = current_entity
1170
(* Returns the reference cell [yy_get_next_ref]; the cell holds a function
 * that calls [next_token] on the current entity.
 *)
method yy_get_next_ref = yy_get_next_ref
1176
(* ======================================================================
1179
* $Log: pxp_entity.ml,v $
1180
* Revision 1.16 2002/03/10 23:39:45 gerd
1181
* ext_id works also for external entities.
1183
* Revision 1.15 2002/02/20 00:25:23 gerd
1184
* using Pxp_lexing instead of Lexing.
1186
* Revision 1.14 2001/06/28 22:42:07 gerd
1187
* Fixed minor problems:
1188
* - Comments must be contained in one entity
1189
* - Pxp_document.document is now initialized with encoding.
1190
* the DTD encoding may be initialized too late.
1192
* Revision 1.13 2001/04/22 14:14:41 gerd
1193
* Updated to support private IDs.
1195
* Revision 1.12 2001/04/22 12:04:55 gerd
1196
* external_entity, method replacement_text: catches errors
1197
* from pxp_reader and transforms them
1199
* Revision 1.11 2000/10/01 19:49:51 gerd
1200
* Numerous optimizations in the class "entity".
1202
* Revision 1.10 2000/09/21 21:28:16 gerd
1203
* New token IgnoreLineEnd: simplifies line counting, and
1206
* Revision 1.9 2000/09/17 00:11:22 gerd
1207
* Optimized line numbering.
1209
* Revision 1.8 2000/09/09 16:39:05 gerd
1212
* Revision 1.7 2000/09/05 21:52:31 gerd
1213
* class internal_entity: Previously, the method open_entity
1214
* initialized the slot last_token to Eof. This is wrong, because
1215
* this causes that handle_bof is never called. The slot last_token
1216
* is now initialized to Bof.
1217
* Critical negative tests: data_jclark_notwf/not-sa/002,
1218
* data_jclark_notwf/sa/153.xml, data_jclark_notwf/sa/161.xml. The
1219
* error messages of these tests changed (checked; the new messages
1222
* Revision 1.6 2000/07/14 13:55:00 gerd
1225
* Revision 1.5 2000/07/09 17:51:50 gerd
1226
* Entities return now the beginning of a token as its
1228
* New method 'position' for entity_manager.
1230
* Revision 1.4 2000/07/09 01:05:04 gerd
1231
* Exported methods 'ext_id' and 'notation' anyway.
1233
* Revision 1.3 2000/07/08 16:28:05 gerd
1234
* Updated: Exception 'Not_resolvable' is taken into account.
1236
* Revision 1.2 2000/07/04 22:12:47 gerd
1237
* Update: Case ext_id = Anonymous.
1238
* Update: Handling of the exception Not_competent when reading
1241
* Revision 1.1 2000/05/29 23:48:38 gerd
1242
* Changed module names:
1243
* Markup_aux into Pxp_aux
1244
* Markup_codewriter into Pxp_codewriter
1245
* Markup_document into Pxp_document
1246
* Markup_dtd into Pxp_dtd
1247
* Markup_entity into Pxp_entity
1248
* Markup_lexer_types into Pxp_lexer_types
1249
* Markup_reader into Pxp_reader
1250
* Markup_types into Pxp_types
1251
* Markup_yacc into Pxp_yacc
1252
* See directory "compatibility" for (almost) compatible wrappers emulating
1253
* Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
1255
* ======================================================================
1256
* Old logs from markup_entity.ml:
1258
* Revision 1.27 2000/05/29 21:14:57 gerd
1259
* Changed the type 'encoding' into a polymorphic variant.
1261
* Revision 1.26 2000/05/28 17:24:55 gerd
1264
* Revision 1.25 2000/05/27 19:23:32 gerd
1265
* The entities store whether they count as external with
1266
* respect to the standalone check: New methods counts_as_external
1267
* and set_counts_as_external.
1268
* The entity manager can find out whether the current
1269
* entity counts as external: method current_entity_counts_as_external.
1271
* Revision 1.24 2000/05/20 20:31:40 gerd
1272
* Big change: Added support for various encodings of the
1273
* internal representation.
1275
* Revision 1.23 2000/05/14 21:51:24 gerd
1276
* Change: Whitespace is handled by the grammar, and no longer
1279
* Revision 1.22 2000/05/14 17:50:54 gerd
1280
* Updates because of changes in the token type.
1282
* Revision 1.21 2000/05/09 00:02:44 gerd
1283
* Conditional sections are now recognized by the parser.
1284
* There seem to be some open questions; see the TODO comments!
1286
* Revision 1.20 2000/05/08 21:58:22 gerd
1287
* Introduced entity_manager as communication object between
1288
* the parser and the currently active entity.
1289
* New hooks handle_bof and handle_eof.
1290
* Removed "delegated entities". The entity manager contains
1291
* the stack of open entities.
1292
* Changed the way Begin_entity and End_entity are inserted.
1293
* This is now done by handle_bof and handle_eof.
1294
* The XML declaration is no longer detected by the entity.
1295
* This is now done by the parser.
1297
* Revision 1.19 2000/05/01 15:18:44 gerd
1298
* Improved CRLF handling in the replacement text of entities.
1299
* Changed one error message.
1301
* Revision 1.18 2000/04/30 18:18:39 gerd
1302
* Bugfixes: The conversion of CR and CRLF to LF is now hopefully
1303
* done right. The new variable "normalize_newline" indicates whether
1304
* normalization must happen for that type of entity. The normalization
1305
* is actually carried out separately for every token that needs it.
1307
* Revision 1.17 2000/03/13 23:42:38 gerd
1308
* Removed the resolver classes, and put them into their
1309
* own module (Markup_reader).
1311
* Revision 1.16 2000/02/22 01:06:58 gerd
1312
* Bugfix: Resolvers are properly re-initialized. This bug caused
1313
* that entities could not be referenced twice in the same document.
1315
* Revision 1.15 2000/01/20 20:54:11 gerd
1316
* New config.errors_with_line_numbers.
1318
* Revision 1.14 2000/01/08 18:59:03 gerd
1319
* Corrected the string resolver.
1321
* Revision 1.13 1999/09/01 22:58:23 gerd
1322
* Method warn_not_latin1 raises Illegal_character if the character
1323
* does not match the Char production.
1324
* External entities that are not document entities check if the
1325
* <?xml...?> declaration at the beginning matches the TextDecl production.
1326
* Method xml_declaration has type ... list option, not ... list.
1327
* Tag_beg and Tag_end now carry an entity_id with them.
1328
* The code to check empty entities has changed. That the Begin_entity/
1329
* End_entity pair is not to be added must be explicitly turned on. See the
1330
* description of empty entity handling in design.txt.
1331
* In internal subsets entity declarations are not allowed to refer
1332
* to parameter entities. The internal_entity class can do this now.
1333
* The p_parsed parameter of internal_entity has gone. It was simply
1336
* Revision 1.12 1999/09/01 16:24:13 gerd
1337
* The method replacement_text returns the text as described for
1338
* "included in literal". The former behaviour has been dropped to include
1339
* a leading and a trailing space character for parameter entities.
1340
* Bugfix: When general entities are included, they are always parsed.
1342
* Revision 1.11 1999/08/31 19:13:31 gerd
1343
* Added checks on proper PE nesting. The idea is that tokens such
1344
* as Decl_element and Decl_rangle carry an entity ID with them. This ID
1345
* is simply an object of type < >, i.e. you can only test on identity.
1346
* The lexer always produces tokens with a dummy ID because it does not
1347
* know which entity is the current one. The entity layer replaces the dummy
1348
* ID with the actual ID. The parser checks that the IDs of pairs such as
1349
* Decl_element and Decl_rangle are the same; otherwise a Validation_error
1352
* Revision 1.10 1999/08/19 01:06:41 gerd
1353
* Improved error messages: external entities print their
1356
* Revision 1.9 1999/08/15 20:35:48 gerd
1357
* Improved error messages.
1358
* Before the tokens Plus, Star, Qmark space is not allowed any longer.
1359
* Detection of recursive entity references is a bit cleaner.
1361
* Revision 1.8 1999/08/15 15:33:44 gerd
1362
* Revised whitespace checking: At certain positions there must be
1363
* white space. These checks cannot be part of the lexer, as %entity; counts
1364
* as white space. They cannot be part of the yacc parser because one look-ahead
1365
* token would not suffice if we did that. So these checks must be done by the
1366
* entity layer. Luckily, the rules are simple: There are simply a number of
1367
* token pairs between which white space must occur independently of where
1368
* these tokens have been found. Two variables, "space_seen", and "last_token"
1369
* have been added in order to check these rules.
1371
* Revision 1.7 1999/08/15 00:41:06 gerd
1372
* The [ token of conditional sections is now allowed to occur
1373
* in a different entity.
1375
* Revision 1.6 1999/08/15 00:29:02 gerd
1376
* The method "attlist_replacement_text" has gone. There is now a
1377
* more general "replacement_text" method that computes the replacement
1378
* text for both internal and external entities. Additionally, this method
1379
* returns whether references to external entities have been resolved;
1380
* this is checked in the cases where formerly "attlist_replacement_text"
1381
* was used as it is not allowed everywhere.
1382
* Entities have a new slot "need_spaces" that indicates that the
1383
* next token must be white space or a parameter reference. The problem
1384
* was that "<!ATTLIST%e;" is legal because when including parameter
1385
* entities white space is added implicitly. Formerly, the white space
1386
* was expected by the underlying lexer; now the lexer does not check
1387
* anymore that "<!ATTLIST" is followed by white space because the lexer
1388
* cannot handle parameter references. Because of this, the check on
1389
* white space must be done by the entity.
1391
* Revision 1.5 1999/08/14 22:57:19 gerd
1392
* It is allowed that external entities are empty because the
1393
* empty string is well-parsed for both declarations and contents. Empty
1394
* entities can be referenced anywhere because the references are replaced
1395
* by nothing. Because of this, the Begin_entity...End_entity brace is only
1396
* inserted if the entity is non-empty. (Otherwise references to empty
1397
* entities would not be allowed anywhere.)
1398
* As a consequence, the grammar has been changed such that a
1399
* single Eof is equivalent to Begin_entity,End_entity without content.
1401
* Revision 1.4 1999/08/14 22:11:19 gerd
1402
* Several objects have now a "warner" as argument which is
1403
* an object with a "warn" method. This is used to warn about characters
1404
* that cannot be represented in the Latin 1 alphabet.
1405
* Previously, the resolvers had features in order to warn about
1406
* such characters; this has been removed.
1407
* UTF-8 streams can be read even if they contain characters
1408
* that cannot be represented by 16 bits.
1409
* The buffering used in the resolvers is now solved in a
1410
* cleaner way; the number of characters that are expected to be read
1411
* from a source can be limited. This removes a bug with UTF-16 streams
1412
* that previously led to wrong exceptions; and the buffering is more
1415
* Revision 1.3 1999/08/11 14:58:53 gerd
1416
* Some more names for encodings are allowed, such as "utf8" instead
1417
* of the standard name "UTF-8".
1418
* 'resolve_as_file' interprets relative file names as relative to
1419
* the "parent" resolver.
1421
* Revision 1.2 1999/08/10 21:35:07 gerd
1422
* The XML/encoding declaration at the beginning of entities is
1423
* evaluated. In particular, entities have now a method "xml_declaration"
1424
* which returns the name/value pairs of such a declaration. The "encoding"
1425
* setting is interpreted by the entity itself; "version", and "standalone"
1426
* are interpreted by Markup_yacc.parse_document_entity. Other settings
1427
* are ignored (this does not conform to the standard; the standard prescribes
1428
* that "version" MUST be given in the declaration of document; "standalone"
1429
* and "encoding" CAN be declared; no other settings are allowed).
1430
* TODO: The user should be warned if the standard is not exactly
1431
* fulfilled. -- The "standalone" property is not checked yet.
1433
* Revision 1.1 1999/08/10 00:35:51 gerd
1214
(* Always returns [None]: no resolver is reported by this entity class. *)
method resolver = (None : resolver option)
1219
(* An entity_section is an object that reads a section from an underlying
1220
* entity as if this section was an entity of its own. In detail, the
1221
* following rules apply:
1222
* - If a token is read from the entity_section, it is actually read from
1223
* the underlying entity (except the first and the last token). I.e.
1224
* the token stream of the entity_section and the underlying entity is
1225
* essentially the same.
1226
* - However, the entity_section has its own lexical context. The method
1227
* set_lex_id changes only the lexer ID of the entity_section, and not
1228
* of the underlying entity.
1229
* - The first token is always Begin_entity, and the last token is always
1230
* End_entity. These tokens are not taken from the underlying entity,
1231
* but simply pretended at the beginning and at the end of the section.
1232
* - The section begins at the current position of the (open) underlying
1233
* entity when the method open_entity of the section is called. It is
1234
* an error if the underlying entity is at the beginning itself.
1235
* [TODO: The latter condition is currently not checked.]
1236
* - The section ends when the method close_entity is called. The next
1237
* token will be End_entity, and then an endless sequence of Eof.
1238
* - A section cannot be opened a second time.
1239
* - Changes of encodings are ignored. (The underlying entity must do that.)
1242
(* State of an entity_section; a freshly created section starts in P_bof. *)
type section_state =
  | P_bof
  | P_normal of int
  | P_pre_eof
  | P_eof
1243
(* P_normal n: The number n is the number of open inner entities *)
1245
class entity_section (init_ent:entity) =
1248
val mutable state = P_bof
1249
val mutable is_open = false
1250
val mutable saved_lex_id = Closed
1252
(* Forwarded unchanged to the underlying entity [ent]. *)
method is_ndata = ent # is_ndata
1253
(* The section reports the name of the underlying entity [ent]. *)
method name = ent # name
1254
(* Returns the lexer ID currently reported by the underlying entity [ent]. *)
method lex_id = ent # lex_id
1257
failwith "Pxp_entity.entity_section#set_lex_id: not open";
1259
(* Line number as reported by the underlying entity [ent]. *)
method line = ent # line
1260
(* Column as reported by the underlying entity [ent]. *)
method column = ent # column
1261
method set_line_column =
1263
failwith "Pxp_entity.entity_section#set_line_column: not open";
1264
ent # set_line_column
1265
(* Encoding as reported by the underlying entity [ent]. *)
method encoding = ent # encoding
1266
(* Accepts a manager object for interface compatibility and discards it:
 * the argument is never stored or used, and the result is unit.
 *)
method set_manager
    (_manager : < current_entity : entity;
                  pop_entity : unit -> unit;
                  push_entity : entity -> unit >) : unit =
  ()
1269
(* Forwarded unchanged to the underlying entity [ent]. *)
method counts_as_external = ent # counts_as_external
1270
(* Not supported for sections: every call raises [Failure] with a fixed
 * message.
 *)
method set_counts_as_external : unit =
  raise
    (Failure "Pxp_entity.entity_section#set_counts_as_external: not possible")
1272
(* Forwarded unchanged to the underlying entity [ent]. *)
method lexer_obj = ent # lexer_obj
1273
(* The section reports the resolver of the underlying entity [ent]. *)
method resolver = ent # resolver
1274
(* The section reports the resolver ID of the underlying entity [ent]. *)
method resolver_id = ent # resolver_id
1275
method open_entity ?(gen_att_events:bool option) (_:bool) (lid:lexers) =
1277
failwith "Pxp_entity.entity_section#open_entity: already open";
1278
if not ent#is_open then
1279
failwith "Pxp_entity.entity_section#open_entity: Underlying entity is not open";
1280
saved_lex_id <- ent # lex_id;
1283
ent # set_lex_id lid;
1284
method close_entity =
1286
failwith "Pxp_entity.entity_section#close_entity: not open";
1288
ent # set_lex_id saved_lex_id;
1289
assert (match state with P_bof | P_normal _ -> true | _ -> false);
1290
(* CHECK: P_normal n when n>0 *)
1293
(* Returns the [is_open] flag kept by the section itself; the open state of
 * the underlying entity is queried separately via [ent # is_open].
 *)
method is_open = is_open
1294
(* Not supported for sections: every call raises [Failure] with a fixed
 * message instead of producing a (text, flag) pair.
 *)
method replacement_text : (string * bool) =
  raise (Failure "Pxp_entity.entity_section#replacement_text: not possible")
1296
(* Forwarded unchanged to the underlying entity [ent]. *)
method xml_declaration = ent # xml_declaration
1297
(* Forwarded to [set_debugging_mode] of the underlying entity [ent]. *)
method set_debugging_mode = ent # set_debugging_mode
1298
(* The section reports the full name of the underlying entity [ent]. *)
method full_name = ent # full_name
1302
state <- P_normal 0;
1305
let tok = ent # next_token in
1307
* - [tok = Begin_entity] can have two reasons: [ent] has produced
1308
* [Begin_entity], or [ent] has just found an entity reference
1309
* whose entity has been opened. Because the latter is possible
1310
* we do not catch [Begin_entity] here.
1311
* - [tok = End_entity]: This token is always produced by [ent],
1312
* and so we can signal an error
1315
| (End_entity | Eof) ->
1316
raise(Error "Cannot end entity here")
1325
method next_ignored_token =
1326
(* We can ignore End_entity and Eof because the caller already signals
1327
* an error when the entity ends in an IGNORE section
1329
ent # next_ignored_token
1330
(* No-op: the token list is ignored and unit is returned. *)
method process_xmldecl (_:prolog_token list) = ()
1331
(* No-op: returns unit without touching the underlying entity. *)
method process_missing_xmldecl = ()
1332
(* Forwarded unchanged to the underlying entity [ent]. *)
method ext_id = ent # ext_id
1333
(* Forwarded unchanged to the underlying entity [ent]. *)
method notation = ent # notation
1337
(* class entity_manager: has been moved to Pxp_entity_manager *)