163
186
(* Returns an object of type resolver.
 * NOTE(review): presumably a copy of this resolver that can be opened
 * independently; the exact semantics are not stated in the visible part
 * of this interface -- confirm against the implementation.
 *)
method clone : resolver
165
(* Closes this resolver and every clone.
 * NOTE(review): a comment further down in this interface states that this
 * method is no longer supported in PXP 1.2 -- confirm whether this
 * declaration is still meant to be exported.
 *)
method close_all : unit
188
(* The resolver ID that is actually in use, as a resolver_id. *)
method active_id : resolver_id
189
(* Returns the actually used resolver ID. This is the ID passed to
190
* open_rid where unused components have been set to None. The
191
* resolver ID returned by [active_id] plays an important role when
192
* expanding relative URLs.
195
(* method close_all : unit *)
166
196
(* Closes this resolver and every clone *)
197
(* This method is no longer supported in PXP 1.2 *)
171
(* Note: resolve_general is no longer exported. In most cases, the classes
172
* resolve_read_any_channel or resolve_read_any_string are applicable, too,
173
* and much easier to configure.
177
203
(* The next classes are resolvers for concrete input sources. *)
179
(* Resolver that reads one entity from an already opened in_channel.
 *   ~id:     if given, only this ext_id is accepted
 *   ~fixenc: if given, forces the encoding of the channel
 *   ~close:  function applied to the channel when the resolver is closed
 * According to the PXP 1.2 notes in this file, all resolve_read_* classes
 * are deprecated.
 *)
class resolve_read_this_channel :
180
?id:ext_id -> ?fixenc:encoding -> ?close:(in_channel -> unit) ->
181
in_channel -> resolver;;
183
(* Reads from the passed channel (it may even be a pipe). If the ~id
184
* argument is passed to the object, the created resolver accepts only
185
* this ID. Otherwise all IDs are accepted.
186
* Once the resolver has been cloned, it does not accept any ID. This
187
* means that this resolver cannot handle inner references to external
188
* entities. Note that you can combine this resolver with another resolver
189
* that can handle inner references (such as resolve_as_file); see
190
* class 'combine' below.
205
(* CHANGES IN PXP 1.2:
207
* All resolve_read_* classes are now deprecated. The new classes
208
* resolve_to_* are based on the Netchannels classes as a generalization of
211
* Examples: To read from an in_channel, use:
213
* let obj_channel = new Netchannels.input_channel in_channel in
214
* new Pxp_reader.resolve_to_this_obj_channel obj_channel
216
* To read from a string, use:
218
* let obj_channel = new Netchannels.input_string string in
219
* new Pxp_reader.resolve_to_this_obj_channel obj_channel
221
* Furthermore, the new classes use the resolver_id record as generalized
222
* names for entities. This solves most problems with relative URLs.
224
* The "Anonymous" ID: In previous versions of PXP, a resolver bound to
225
* the Anonymous ID matched the Anonymous ID. This is no longer true.
226
* The algebra has been changed such that Anonymous never matches, not
229
* Example: The new resolver
230
* let r = new resolve_to_this_obj_channel ~id:Anonymous ch
231
* will never accept any ID. In contrast to this, the old, and now
232
* deprecated resolver
233
* let r' = new resolve_read_this_channel ~id:Anonymous ch
234
* accepted the ID Anonymous in previous versions of PXP.
236
* The rationale behind this change is that Anonymous acts now like
237
* an "empty set", and not like a concrete element. You can use Private
238
* to create as many concrete elements as you want, so there is actually
239
* no need for the old behaviour of Anonymous.
241
* Note that even the resolver classes provided for backwards compatibility
242
* implement this change (to limit the confusion). This means that you
243
* might have to change your application to use Private instead of
248
class resolve_to_this_obj_channel :
252
?close:(Netchannels.in_obj_channel -> unit) ->
253
Netchannels.in_obj_channel ->
256
(* Reads from the passed in_obj_channel. If the ~id or ~rid arguments
257
* are passed to the object, the created resolver accepts only
258
* these IDs (all mentioned private, system, or public IDs). Otherwise
259
* all IDs are accepted, even Anonymous.
261
* This resolver can only be used once (because the in_obj_channel
262
* can only be used once). If it is opened a second time (either
263
* in the base object or a clone), it will raise Not_competent.
191
265
* If you pass the ~fixenc argument, the encoding of the channel is
192
266
* set to the passed value, regardless of any auto-recognition or
193
267
* any XML declaration.
194
269
* When the resolver is closed, the function passed by the ~close
195
270
* argument is called. By default, the channel is closed
196
* (i.e. the default is: ~close:close_in).
200
class resolve_read_any_channel :
201
?close:(in_channel -> unit) ->
202
channel_of_id:(ext_id -> (in_channel * encoding option)) ->
271
* (i.e. the default is: ~close:(fun ch -> ch # close_in())).
275
Netchannels.in_obj_channel * encoding option * resolver_id option
276
(* When a resolver accepts an ID, this triple specifies how to proceed.
277
* The in_obj_channel is the channel to read data from, the encoding option
278
* may enforce a certain character encoding, and the resolver_id option
279
* may detail the ID (this ID will be returned by active_id).
281
* If None is passed as encoding option, the standard autodetection of
282
* the encoding is performed.
284
* If None is passed as resolver_id option, the original ID is taken
288
class resolve_to_any_obj_channel :
289
?close:(Netchannels.in_obj_channel -> unit) ->
290
channel_of_id:(resolver_id -> accepted_id) ->
206
(* resolve_read_any_channel f_open ():
207
* This resolver calls the function f_open to open a new channel for
208
* the passed ext_id. This function must either return the channel and
209
* the encoding, or it must fail with Not_competent.
210
* The function must return None as encoding if the default mechanism to
211
* recognize the encoding should be used. It must return Some e if it is
212
* already known that the encoding of the channel is e.
294
(* This resolver calls the function channel_of_id to open a new channel for
295
* the passed resolver_id. This function must either return the accepted_id,
296
* or it must fail with Not_competent.
213
298
* When the resolver is closed, the function passed by the ~close
214
299
* argument is called. By default, the channel is closed
215
* (i.e. the default is: ~close:close_in).
300
* (i.e. the default is: ~close:(fun ch -> ch # close_in())).
219
class resolve_read_url_channel :
220
?base_url:Neturl.url ->
221
?close:(in_channel -> unit) ->
222
url_of_id:(ext_id -> Neturl.url) ->
223
channel_of_url:(ext_id -> Neturl.url -> (in_channel * encoding option)) ->
304
class resolve_to_url_obj_channel :
305
?close:(Netchannels.in_obj_channel -> unit) ->
306
url_of_id:(resolver_id -> Neturl.url) ->
307
base_url_of_id:(resolver_id -> Neturl.url) ->
308
channel_of_url:(resolver_id -> Neturl.url -> accepted_id) ->
227
(* resolve_read_url_channel url_of_id channel_of_url ():
229
313
* When this resolver gets an ID to read from, it calls the function
230
* ~url_of_id to get the corresponding URL. This URL may be a relative
231
* URL; however, a URL scheme must be used which contains a path.
232
* The resolver converts the URL to an absolute URL if necessary.
233
* The second function, ~channel_of_url, is fed with the absolute URL
314
* ~url_of_id to get the corresponding URL (such IDs are normally
315
* system IDs, but it is also possible to map other kinds of IDs to URLs).
316
* This URL may be a relative URL; however, a URL scheme must be used
317
* which contains a path. The resolver converts the URL to an absolute
320
* To do so, the resolver calls ~base_url_of_id to get the URL the relative
321
* URL must be interpreted relative to. Usually, this function returns
322
* the rid_system_base as URL. This URL must be absolute.
324
* The third function, ~channel_of_url, is fed with the absolute URL
234
325
* as input. This function opens the resource to read from, and returns
235
* the channel and the encoding of the resource.
326
* the accepted_id like resolve_to_any_obj_channel does. The resolver ID
327
* passed to ~channel_of_url contains the string representation of the
328
* absolute URL as system ID.
237
330
* Both functions, ~url_of_id and ~channel_of_url, can raise
238
331
* Not_competent to indicate that the object is not able to read from
239
332
* the specified resource. However, there is a difference: A Not_competent
240
333
* from ~url_of_id is left as it is, but a Not_competent from ~channel_of_url
241
334
* is converted to Not_resolvable. So only ~url_of_id decides which URLs
242
* are accepted by the resolver and which not.
244
* The function ~channel_of_url must return None as encoding if the default
245
* mechanism to recognize the encoding should be used. It must return
246
* Some e if it is already known that the encoding of the channel is e.
335
* are accepted by the resolver and which not, and in the latter case,
336
* other resolvers can be tried. If ~channel_of_url raises Not_competent,
337
* the whole resolution procedure will stop, and no other resolver will
248
340
* When the resolver is closed, the function passed by the ~close
249
341
* argument is called. By default, the channel is closed
250
* (i.e. the default is: ~close:close_in).
252
* Objects of this class contain a base URL relative to which relative
253
* URLs are interpreted. When creating a new object, you can specify
254
* the base URL by passing it as ~base_url argument. When an existing
255
* object is cloned, the base URL of the clone is the URL of the original
258
* Note that the term "base URL" has a strict definition in RFC 1808.
262
(* Resolver that reads one entity from the given string.
 *   ~id:     if given, only this ext_id is accepted
 *   ~fixenc: if given, forces the encoding of the string
 * According to the PXP 1.2 notes in this file, all resolve_read_* classes
 * are deprecated.
 *)
class resolve_read_this_string :
263
?id:ext_id -> ?fixenc:encoding -> string -> resolver;;
265
(* Reads from the passed string. If the ~id
266
* argument is passed to the object, the created resolver accepts only
267
* this ID. Otherwise all IDs are accepted.
268
* Once the resolver has been cloned, it does not accept any ID. This
269
* means that this resolver cannot handle inner references to external
270
* entities. Note that you can combine this resolver with another resolver
271
* that can handle inner references (such as resolve_as_file); see
272
* class 'combine' below.
273
* If you pass the ~fixenc argument, the encoding of the string is
274
* set to the passed value, regardless of any auto-recognition or
275
* any XML declaration.
279
(* Resolver that obtains the entity text by calling ~string_of_id with the
 * requested ext_id. The function returns the string together with an
 * optional encoding (None selects the default encoding recognition), or
 * fails with Not_competent.
 * According to the PXP 1.2 notes in this file, all resolve_read_* classes
 * are deprecated.
 *)
class resolve_read_any_string :
280
string_of_id:(ext_id -> (string * encoding option)) -> unit -> resolver;;
282
(* resolve_read_any_string f_open ():
283
* This resolver calls the function f_open to get the string for
284
* the passed ext_id. This function must either return the string and
285
* the encoding, or it must fail with Not_competent.
286
* The function must return None as encoding if the default mechanism to
287
* recognize the encoding should be used. It must return Some e if it is
288
* already known that the encoding of the string is e.
342
* (i.e. the default is: ~close:(fun ch -> ch # close_in())).
439
565
* pairs (sysid, text) mapping SYSTEM identifiers to XML text (which must
440
566
* begin with <?xml ...?>).
568
* Note: SYSTEM IDs are simply compared literally, without making
569
* relative IDs absolute. See norm_system_id below for this function.
442
571
* ~fixenc: Overrides the encoding of the strings.
575
(* Wrapper class: takes a resolver and yields a resolver that normalizes
 * the SYSTEM ID before the open request is forwarded (presumably to the
 * resolver given as argument -- confirm).
 *)
class norm_system_id : resolver -> resolver
576
(* Normalizes the SYSTEM ID, and forwards the open request to the
579
* Normalization includes:
580
* - Relative URLs are made absolute. If this fails, the problematic
581
* relative URL will be rejected.
582
* - .. and . and // in the middle of URLs are removed
583
* - Escaping of reserved characters is normalized
585
* Normalization is recommended for catalogs, e.g.
587
* (new lookup_system_id_as_file
588
* [ "http://h/p1", ...;
589
* "http://h/p2", ...;
591
* First, the catalog now even works if the URL is written in an
592
* unusual way, e.g. http://h/p1/../p2, or http://h/p%31.
593
* Second, relative URLs can be used. For instance, the document
594
* referred to as http://h/p1 can now refer to the other document
599
class rewrite_system_id :
600
?forward_unmatching_urls:bool ->
601
(string * string) list ->
604
(* Rewrites the SYSTEM URL according to the list of pairs. The left
605
* component is the pattern, the right component is the substitute.
608
* new rewrite_system_id
609
* [ "http://host/foo/", "file:///dir/" ]
612
* rewrites all URLs beginning with http://host/foo/ to file:///dir/,
613
* e.g. http://host/foo/x becomes file:///dir/x.
615
* If the pattern ends with a slash (as in the example), a prefix match
616
* is performed, i.e. the whole directory hierarchy is rewritten.
617
* If the pattern does not end with a slash, an exact match is performed,
618
* i.e. only a single URL is rewritten.
620
* The class normalizes URLs as norm_system_id does, before the match
623
* By default, URLs that do not match any pattern are rejected
626
* The rewritten URL is only visible within the passed subresolver.
627
* If the opened entity accesses other entities by relative URLs,
628
* these will be resolved relative to the original URL as it was before
629
* rewriting it. This gives some protection against unwanted accesses.
630
* For example, if you map http://host/contents to file:///data/contents,
631
* it will not be possible to access files outside this directory,
632
* even if tricks are used like opening ../../etc/passwd relative to
633
* http://host/contents. Of course, this protection works only if
634
* the resolver opening the file is a subresolver of rewrite_system_id.
636
* CHECK: Does this really work?
638
* Another application of this class is to use the identity as rewriting
639
* rule. This resolver
641
* new rewrite_system_id
642
* [ "file:///data/", "file:///data/" ]
643
* ( new resolve_as_file() )
645
* has the effect that only files under /data can be accessed, and
646
* others such as /etc/passwd cannot.
648
* Option ~forward_unmatching_urls: If true, URLs that do not match any
649
* pattern are forwarded to the inner resolver. These URLs are not
650
* rewritten. NOTE THAT THE MENTIONED ACCESS RESTRICTIONS USUALLY DO
651
* NOT WORK ANYMORE IF THIS OPTION IS TURNED ON.
446
655
(* Order in which the names of an entity are tried when it has both a
 * public and a system identifier. Used as the ~mode argument of the
 * combining resolver.
 *)
type combination_mode =
447
656
Public_before_system (* Try public identifiers first *)
448
657
| System_before_public (* Try system identifiers first *)
453
662
?mode:combination_mode ->
457
666
(* Combines several resolver objects. If a concrete entity with an
458
667
* ext_id is to be opened, the combined resolver tries the contained
459
668
* resolvers in turn until a resolver accepts opening the entity
460
* (i.e. it does not raise Not_competent on open_in).
462
* If the ext_id is a public identifier Public(pubid,sysid), there are
669
* (i.e. it does not raise Not_competent on open_rid).
671
* If the entity to open has several names, e.g. a public name and
672
* a system name, these names are tried in parallel by default (this
673
* is possible in the PXP 1.2 model). For backward compatibility, the
674
* ~mode argument allows one to specify a different order:
464
676
* (1) Try first to open as public identifier, and if that fails,
465
* fall back to the system identifier
677
* fall back to the system identifier (Public_before_system)
466
678
* (2) Try first to open as system identifier, and if that fails,
467
* fall back to the public identifier
468
* You can select this by the ~mode argument. The default is to
469
* try public identifiers first.
679
* fall back to the public identifier (System_before_public)
471
* Clones: If the 'clone' method is invoked before 'open_in', all contained
681
* Clones: If the 'clone' method is invoked before 'open_rid', all contained
472
682
* resolvers are cloned and again combined. If the 'clone' method is
473
* invoked after 'open_in' (i.e. while the resolver is open), only the
683
* invoked after 'open_rid' (i.e. while the resolver is open), only the
474
684
* active resolver is cloned.
476
* ~prefer: This is an internally used option.
479
688
(* ====================================================================== *)
690
(* TODO: The following examples recommend deprecated classes. *)
481
692
(* EXAMPLES OF RESOLVERS:
483
694
* let r1 = new resolve_as_file ()
564
775
* from_string are also applications of the Pxp_reader objects.
568
(* ======================================================================
571
* $Log: pxp_reader.mli,v $
572
* Revision 1.9 2001/07/01 08:35:23 gerd
573
* Instead of the ~auto_close argument, there is now a
574
* ~close argument for several functions/classes. This allows some
575
* additional action when the resolver is closed.
577
* Revision 1.8 2001/04/22 14:16:48 gerd
578
* resolve_as_file: you can map private IDs to arbitrary channels.
579
* resolve_read_url_channel: changed type of the channel_of_url
580
* argument (ext_id is also passed)
581
* More examples and documentation.
583
* Revision 1.7 2001/04/03 20:22:44 gerd
584
* New resolvers for catalogs of PUBLIC and SYSTEM IDs.
585
* Improved "combine": PUBLIC and SYSTEM IDs are handled
587
* Rewritten from_file: Is now a simple application of the
588
* Pxp_reader classes and functions. (The same has still to be done
591
* Revision 1.6 2001/02/01 20:38:49 gerd
592
* New support for PUBLIC identifiers.
594
* Revision 1.5 2000/07/09 01:05:33 gerd
595
* New methode 'close_all' that closes the clones, too.
597
* Revision 1.4 2000/07/08 16:24:56 gerd
598
* Introduced the exception 'Not_resolvable' to indicate that
599
* 'combine' should not try the next resolver of the list.
601
* Revision 1.3 2000/07/06 23:04:46 gerd
602
* Quick fix for 'combine': The active resolver is "prefered",
603
* but the other resolvers are also used.
605
* Revision 1.2 2000/07/04 22:06:49 gerd
606
* MAJOR CHANGE: Complete redesign of the reader classes.
608
* Revision 1.1 2000/05/29 23:48:38 gerd
609
* Changed module names:
610
* Markup_aux into Pxp_aux
611
* Markup_codewriter into Pxp_codewriter
612
* Markup_document into Pxp_document
613
* Markup_dtd into Pxp_dtd
614
* Markup_entity into Pxp_entity
615
* Markup_lexer_types into Pxp_lexer_types
616
* Markup_reader into Pxp_reader
617
* Markup_types into Pxp_types
618
* Markup_yacc into Pxp_yacc
619
* See directory "compatibility" for (almost) compatible wrappers emulating
620
* Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
622
* ======================================================================
623
* Old logs from markup_reader.mli:
625
* Revision 1.3 2000/05/29 21:14:57 gerd
626
* Changed the type 'encoding' into a polymorphic variant.
628
* Revision 1.2 2000/05/20 20:31:40 gerd
629
* Big change: Added support for various encodings of the
630
* internal representation.
632
* Revision 1.1 2000/03/13 23:41:54 gerd
778
(**********************************************************************)
779
(* DEPRECATED CLASSES *)
780
(**********************************************************************)
782
(* Deprecated resolver that reads one entity from an already opened
 * in_channel; resolve_to_this_obj_channel is the replacement.
 *   ~id:     if given, only this ext_id is accepted (except Anonymous)
 *   ~fixenc: if given, forces the encoding of the channel
 *   ~close:  function called when the resolver is closed
 *)
class resolve_read_this_channel :
783
?id:ext_id -> ?fixenc:encoding -> ?close:(in_channel -> unit) ->
784
in_channel -> resolver;;
786
(* THIS CLASS IS DEPRECATED! USE resolve_to_this_obj_channel INSTEAD!
789
(* Reads from the passed channel (it may even be a pipe). If the ~id
790
* argument is passed to the object, the created resolver accepts only
791
* this ID (except Anonymous). Otherwise all IDs are accepted, even
793
* Once the resolver has been cloned, it does not accept any ID. This
794
* means that this resolver cannot handle inner references to external
795
* entities. Note that you can combine this resolver with another resolver
796
* that can handle inner references (such as resolve_as_file); see
797
* class 'combine' below.
798
* If you pass the ~fixenc argument, the encoding of the channel is
799
* set to the passed value, regardless of any auto-recognition or
800
* any XML declaration.
801
* When the resolver is closed, the function passed by the ~close
802
* argument is called. By default, the channel is closed
803
* (i.e. the default is: ~close:close_in).
807
class resolve_read_any_channel :
808
?close:(in_channel -> unit) ->
809
channel_of_id:(ext_id -> (in_channel * encoding option)) ->
813
(* THIS CLASS IS DEPRECATED! USE resolve_to_any_obj_channel INSTEAD!
815
* Note: The function channel_of_id may be called several times to find
816
* out the right ext_id from the current resolver_id. The first result
817
* is taken that is not Not_competent.
820
(* resolve_read_any_channel f_open ():
821
* This resolver calls the function f_open to open a new channel for
822
* the passed ext_id. This function must either return the channel and
823
* the encoding, or it must fail with Not_competent.
824
* The function must return None as encoding if the default mechanism to
825
* recognize the encoding should be used. It must return Some e if it is
826
* already known that the encoding of the channel is e.
827
* When the resolver is closed, the function passed by the ~close
828
* argument is called. By default, the channel is closed
829
* (i.e. the default is: ~close:close_in).
832
class resolve_read_url_channel :
833
?base_url:Neturl.url ->
834
?close:(in_channel -> unit) ->
835
url_of_id:(ext_id -> Neturl.url) ->
836
channel_of_url:(ext_id -> Neturl.url -> (in_channel * encoding option)) ->
840
(* THIS CLASS IS DEPRECATED! USE resolve_to_url_obj_channel INSTEAD!
842
* Note: The function url_of_id may be called several times to find
843
* out the right ext_id from the current resolver_id. The first result
844
* is taken that is not Not_competent.
846
* Note: The optional argument base_url is ignored. The class uses always
847
* the rid_system_base string to interpret relative URLs.
850
(* resolve_read_url_channel url_of_id channel_of_url ():
852
* When this resolver gets an ID to read from, it calls the function
853
* ~url_of_id to get the corresponding URL. This URL may be a relative
854
* URL; however, a URL scheme must be used which contains a path.
855
* The resolver converts the URL to an absolute URL if necessary.
856
* The second function, ~channel_of_url, is fed with the absolute URL
857
* as input. This function opens the resource to read from, and returns
858
* the channel and the encoding of the resource.
860
* Both functions, ~url_of_id and ~channel_of_url, can raise
861
* Not_competent to indicate that the object is not able to read from
862
* the specified resource. However, there is a difference: A Not_competent
863
* from ~url_of_id is left as it is, but a Not_competent from ~channel_of_url
864
* is converted to Not_resolvable. So only ~url_of_id decides which URLs
865
* are accepted by the resolver and which not.
867
* The function ~channel_of_url must return None as encoding if the default
868
* mechanism to recognize the encoding should be used. It must return
869
* Some e if it is already known that the encoding of the channel is e.
871
* When the resolver is closed, the function passed by the ~close
872
* argument is called. By default, the channel is closed
873
* (i.e. the default is: ~close:close_in).
875
* [Does not apply to current implementation but to former ones:]
876
* Objects of this class contain a base URL relative to which relative
877
* URLs are interpreted. When creating a new object, you can specify
878
* the base URL by passing it as ~base_url argument. When an existing
879
* object is cloned, the base URL of the clone is the URL of the original
882
* Note that the term "base URL" has a strict definition in RFC 1808.
886
(* Deprecated resolver that reads one entity from the given string;
 * resolve_to_this_obj_channel is the replacement.
 *   ~id:     if given, only this ext_id is accepted (except Anonymous)
 *   ~fixenc: if given, forces the encoding of the string
 *)
class resolve_read_this_string :
887
?id:ext_id -> ?fixenc:encoding -> string -> resolver;;
889
(* THIS CLASS IS DEPRECATED! USE resolve_to_this_obj_channel INSTEAD!
892
(* Reads from the passed string. If the ~id
893
* argument is passed to the object, the created resolver accepts only
894
* this ID (except Anonymous). Otherwise all IDs are accepted, even
896
* Once the resolver has been cloned, it does not accept any ID. This
897
* means that this resolver cannot handle inner references to external
898
* entities. Note that you can combine this resolver with another resolver
899
* that can handle inner references (such as resolve_as_file); see
900
* class 'combine' below.
901
* If you pass the ~fixenc argument, the encoding of the string is
902
* set to the passed value, regardless of any auto-recognition or
903
* any XML declaration.
907
(* Deprecated resolver that obtains the entity text by calling
 * ~string_of_id with the requested ext_id; resolve_to_any_obj_channel is
 * the replacement. The function returns the string together with an
 * optional encoding (None selects the default encoding recognition), or
 * fails with Not_competent.
 *)
class resolve_read_any_string :
908
string_of_id:(ext_id -> (string * encoding option)) -> unit -> resolver;;
910
(* THIS CLASS IS DEPRECATED! USE resolve_to_any_obj_channel INSTEAD!
913
(* resolve_read_any_string f_open ():
914
* This resolver calls the function f_open to get the string for
915
* the passed ext_id. This function must either return the string and
916
* the encoding, or it must fail with Not_competent.
917
* The function must return None as encoding if the default mechanism to
918
* recognize the encoding should be used. It must return Some e if it is
919
* already known that the encoding of the string is e.
922
val lookup_public_id_as_file :
924
(string * string) list -> (* catalog *)
926
(* Same as the equally named class *)
928
val lookup_public_id_as_string :
930
(string * string) list -> (* catalog *)
932
(* Same as the equally named class *)
934
val lookup_system_id_as_file :
936
(string * string) list -> (* catalog *)
938
(* Same as the equally named class *)
940
val lookup_system_id_as_string :
942
(string * string) list -> (* catalog *)
944
(* Same as the equally named class *)