1
(******************************************************************************
4
* Copyright (C) 2005- Jane Street Holding, LLC *
5
* Contact: opensource@janestreet.com *
6
* WWW: http://www.janestreet.com/ocaml *
7
* Author: Markus Mottl *
9
* This library is free software; you can redistribute it and/or *
10
* modify it under the terms of the GNU Lesser General Public *
11
* License as published by the Free Software Foundation; either *
12
* version 2 of the License, or (at your option) any later version. *
14
* This library is distributed in the hope that it will be useful, *
15
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
16
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *
17
* Lesser General Public License for more details. *
19
* You should have received a copy of the GNU Lesser General Public *
20
* License along with this library; if not, write to the Free Software *
21
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA *
23
******************************************************************************)
25
1
(* Sexp: Module for handling S-expressions (I/O, etc.) *)
148
127
let to_buffer = to_buffer_mach
129
let to_buffer_gen ~buf ~add_char ~add_string sexp =
130
let rec loop may_need_space = function
132
let str' = maybe_esc_str str in
133
let new_may_need_space = str' == str in
134
if may_need_space && new_may_need_space then add_char buf ' ';
139
let may_need_space = loop false h in
140
loop_rest may_need_space t;
142
| List [] -> add_string buf "()"; false
143
and loop_rest may_need_space = function
145
let may_need_space = loop may_need_space h in
146
loop_rest may_need_space t
147
| [] -> add_char buf ')' in
148
ignore (loop false sexp)
151
151
(* Output of S-expressions to I/O-channels *)
153
let buffer () = Buffer.create 4096
153
(* The maximum size of a thing on the minor heap is 256 words.
154
Previously, this size of the returned buffer here was 4096 bytes, which
155
caused the Buffer to be allocated on the *major* heap every time.
157
According to a simple benchmark by Ron, we can improve performance for
158
small s-expressions by a factor of ~4 if we only allocate 1024 bytes
159
(128 words + some small overhead) worth of buffer initially. And one
160
can argue that if it's free to allocate strings smaller than 256 words,
161
large s-expressions requiring larger expensive buffers won't notice
162
the extra two doublings from 1024 bytes to 2048 and 4096. And especially
163
performance-sensitive applications to always pass in a larger buffer to
165
let buffer () = Buffer.create 1024
155
167
let with_new_buffer oc f =
156
168
let buf = buffer () in
172
184
(* Output of S-expressions to file *)
186
(* The temp file functions in the OCaml Filename module do not support
187
permissions. But opening a file with given permissions is different
188
from opening it and chmoding it to these permissions, because the umask
189
is taken in account. Under Unix there's no easy way to get the umask in
190
a thread-safe way. *)
191
module Tmp_file = struct
194
let temp_file_name prefix suffix =
195
let rand_state = match !prng with
198
let ret = Random.State.make_self_init () in
202
let rnd = (Random.State.bits rand_state) land 0xFFFFFF in
203
Printf.sprintf "%s%06x%s" prefix rnd suffix
205
(* Keep the permissions loose. Sexps are usually shared and rarely private*)
206
let open_temp_file ?(perm = 0o666) prefix suffix =
207
let rec try_name counter =
208
let name = temp_file_name prefix suffix in
211
open_out_gen [Open_wronly; Open_creat; Open_excl; Open_text] perm name
214
with Sys_error _ as e ->
215
if counter >= 1000 then raise e else try_name (counter + 1)
174
220
let save_of_output ?perm output_function file sexp =
177
if Filename.is_relative file then "."
178
else Filename.dir_sep
180
Filename.open_temp_file ~temp_dir file "tmp"
183
output_function oc sexp;
189
let umask = Unix.umask 0 in
190
ignore (Unix.umask umask);
193
if perm <> 0o600 then Unix.chmod tmp_name perm;
194
Sys.rename tmp_name file
221
let tmp_name, oc = Tmp_file.open_temp_file ?perm file "tmp" in
224
output_function oc sexp;
197
227
close_out_noerr oc;
228
begin try Sys.remove tmp_name with _ -> () end;
231
Sys.rename tmp_name file
200
233
let output_sexp_nl do_output oc sexp =
201
234
do_output oc sexp;
378
429
parse_pos.Parse_pos.buf_pos <- buf_pos;
379
430
parse_pos.Parse_pos.global_offset <- parse_pos.Parse_pos.global_offset + len
381
let mk_parse_pos { parse_pos } buf_pos =
432
let mk_parse_pos { parse_pos; _ } buf_pos =
382
433
set_parse_pos parse_pos buf_pos;
385
436
let raise_parse_error parse_state location buf_pos err_msg =
387
match parse_state with
388
| `Sexp { parse_pos } | `Annot { parse_pos } ->
389
set_parse_pos parse_pos buf_pos;
390
parse_pos.Parse_pos.text_char <- parse_pos.Parse_pos.text_char + 1;
392
let parse_error = { location; err_msg; parse_state } in
393
raise (Parse_error parse_error)
437
match parse_state with
438
| `Sexp { parse_pos; _ } | `Annot { parse_pos; _ } ->
439
set_parse_pos parse_pos buf_pos;
440
let parse_error = { location; err_msg; parse_state } in
441
raise (Parse_error parse_error)
395
443
let raise_unexpected_char parse_state location buf_pos c =
396
444
let err_msg = sprintf "unexpected character: '%c'" c in
397
445
raise_parse_error parse_state location buf_pos err_msg
399
(* The code below is derived from lexer.mll in the OCaml distribution,
400
which also uses ASCII codes instead of escape sequences to denote
401
special characters. *)
447
let mk_cont_parser cont_parse = (); fun _state str ~max_pos ~pos ->
448
let len = max_pos - pos + 1 in
449
cont_parse ~pos ~len str
403
451
(* Macro for generating parsers *)
404
452
#define MK_PARSER( \
467
523
bump_pos_cont state str ~max_pos ~pos PARSE) \
468
524
| ' ' | '\009' | '\012' -> bump_pos_cont state str ~max_pos ~pos PARSE \
469
525
| '\010' -> bump_line_cont state str ~max_pos ~pos PARSE \
470
| '\013' -> bump_line_cont state str ~max_pos ~pos parse_nl \
526
| '\013' -> bump_pos_cont state str ~max_pos ~pos parse_nl \
471
527
| ';' -> bump_pos_cont state str ~max_pos ~pos parse_comment \
474
530
bump_pos_cont state str ~max_pos ~pos parse_quoted \
477
add_bump_pos state str ~max_pos ~pos c parse_atom \
535
| '#' -> maybe_parse_comment \
536
| '|' -> maybe_parse_close_comment \
539
add_bump_pos state str ~max_pos ~pos c parse \
479
541
and parse_nl state str ~max_pos ~pos = \
480
542
if pos > max_pos then mk_cont "parse_nl" parse_nl state \
482
let pos = if GET_CHAR = '\010' then pos + 1 else pos in \
483
PARSE state str ~max_pos ~pos \
544
let c = GET_CHAR in \
545
if c = '\010' then bump_line_cont state str ~max_pos ~pos PARSE \
546
else raise_unexpected_char (MK_PARSE_STATE state) "parse_nl" pos c \
485
548
and parse_comment state str ~max_pos ~pos = \
486
549
if pos > max_pos then mk_cont "parse_comment" parse_comment state \
488
551
match GET_CHAR with \
489
552
| '\010' -> bump_line_cont state str ~max_pos ~pos PARSE \
490
| '\013' -> bump_line_cont state str ~max_pos ~pos parse_nl \
553
| '\013' -> bump_pos_cont state str ~max_pos ~pos parse_nl \
491
554
| _ -> bump_pos_cont state str ~max_pos ~pos parse_comment \
556
and maybe_parse_comment state str ~max_pos ~pos = \
557
if pos > max_pos then \
558
mk_cont "maybe_parse_comment" maybe_parse_comment state \
560
match GET_CHAR with \
561
| ';' -> bump_pos_cont state str ~max_pos ~pos parse_sexp_comment \
562
| '|' -> bump_pos_cont state str ~max_pos ~pos parse_block_comment \
563
| _ -> parse_atom state str ~max_pos ~pos \
565
and maybe_parse_close_comment state str ~max_pos ~pos = \
566
if pos > max_pos then \
567
mk_cont "maybe_parse_close_comment" maybe_parse_close_comment state \
569
if GET_CHAR <> '#' then parse_atom state str ~max_pos ~pos \
571
let err_msg = "end of block comment without start" in \
572
raise_parse_error (MK_PARSE_STATE state) \
573
"maybe_parse_close_comment" pos err_msg \
575
and parse_sexp_comment state str ~max_pos ~pos = \
576
let pbuf_str = "" in \
578
Buffer.clear state.pbuf; \
579
let old_pstack = GET_PSTACK in \
582
let rec loop parse state str ~max_pos ~pos = \
583
match parse state str ~max_pos ~pos with \
584
| Done (_sexp, { Parse_pos.buf_pos = pos; _ }) -> \
585
Buffer.clear state.pbuf; \
586
let pstack = old_pstack in \
588
PARSE state str ~max_pos ~pos \
589
| Cont (_, cont_parse) -> \
590
Buffer.clear state.pbuf; \
591
let parse = mk_cont_parser cont_parse in \
592
mk_cont_state "parse_sexp_comment" (loop parse) state \
593
~cont_state:Cont_state.Parsing_sexp_comment \
595
loop PARSE state str ~max_pos ~pos \
597
and parse_block_comment state str ~max_pos ~pos = \
598
let pbuf_str = "" in \
600
Buffer.clear state.pbuf; \
601
let old_pstack = GET_PSTACK in \
604
let rec loop depth state str ~max_pos ~pos = \
605
let rec parse_block_depth state str ~max_pos ~pos = \
606
if pos > max_pos then \
607
mk_cont "parse_block_depth" parse_block_depth state \
609
match GET_CHAR with \
610
| '\010' -> bump_line_cont state str ~max_pos ~pos parse_block_depth \
613
let rec parse_block_quote parse state str ~max_pos ~pos = \
614
match parse state str ~max_pos ~pos with \
615
| Done (_sexp, { Parse_pos.buf_pos = pos; _ }) -> \
616
Buffer.clear state.pbuf; \
617
parse_block_depth state str ~max_pos ~pos \
618
| Cont (_, cont_parse) -> \
619
Buffer.clear state.pbuf; \
620
let parse = mk_cont_parser cont_parse in \
621
mk_cont_state "parse_block_quote" \
622
(parse_block_quote parse) state \
623
~cont_state:Cont_state.Parsing_block_comment \
625
bump_pos_cont state str ~max_pos ~pos \
626
(parse_block_quote parse_quoted) \
627
| '#' -> bump_pos_cont state str ~max_pos ~pos parse_open_block \
628
| '|' -> bump_pos_cont state str ~max_pos ~pos parse_close_block \
629
| _ -> bump_pos_cont state str ~max_pos ~pos parse_block_depth \
630
and parse_open_block state str ~max_pos ~pos = \
631
if pos > max_pos then \
632
mk_cont "parse_open_block" parse_open_block state \
634
if GET_CHAR = '|' then \
635
bump_pos_cont state str ~max_pos ~pos (loop (depth + 1)) \
636
else parse_block_depth state str ~max_pos ~pos \
637
and parse_close_block state str ~max_pos ~pos = \
638
if pos > max_pos then \
639
mk_cont "parse_close_block" parse_close_block state \
640
else if GET_CHAR = '#' then \
643
let () = Buffer.clear state.pbuf in \
644
let pstack = old_pstack in \
647
else loop (depth - 1) \
649
bump_pos_cont state str ~max_pos ~pos parse \
650
else parse_block_depth state str ~max_pos ~pos \
652
parse_block_depth state str ~max_pos ~pos \
654
loop 1 state str ~max_pos ~pos \
493
656
and parse_atom state str ~max_pos ~pos = \
494
657
if pos > max_pos then mk_cont "parse_atom" parse_atom state \
496
659
match GET_CHAR with \
497
660
| ' ' | '\009' | '\012' -> \
498
661
bump_found_atom bump_text_pos state str ~max_pos ~pos PARSE \
663
add_bump_pos state str ~max_pos ~pos c maybe_parse_bad_atom_hash \
665
add_bump_pos state str ~max_pos ~pos c maybe_parse_bad_atom_pipe \
500
667
let pbuf = state.pbuf in \
501
668
let pbuf_str = Buffer.contents pbuf in \
536
703
bump_text_pos state str ~max_pos ~pos reg_parse_quoted \
537
704
| c -> add_bump_pos state str ~max_pos ~pos c parse_atom \
706
and maybe_parse_bad_atom_pipe state str ~max_pos ~pos = \
707
if pos > max_pos then \
708
mk_cont "maybe_parse_bad_atom_pipe" maybe_parse_bad_atom_pipe state \
710
match GET_CHAR with \
712
let err_msg = "illegal end of block comment in unquoted atom" in \
713
raise_parse_error (MK_PARSE_STATE state) "maybe_parse_bad_atom_pipe" \
715
| _ -> parse_atom state str ~max_pos ~pos \
717
and maybe_parse_bad_atom_hash state str ~max_pos ~pos = \
718
if pos > max_pos then \
719
mk_cont "maybe_parse_bad_atom_hash" maybe_parse_bad_atom_hash state \
721
match GET_CHAR with \
723
let err_msg = "illegal start of block comment in unquoted atom" in \
724
raise_parse_error (MK_PARSE_STATE state) "maybe_parse_bad_atom_hash" \
726
| _ -> parse_atom state str ~max_pos ~pos \
539
728
and reg_parse_quoted state str ~max_pos ~pos = \
541
730
parse_quoted state str ~max_pos ~pos \
542
732
and parse_quoted state str ~max_pos ~pos = \
543
733
if pos > max_pos then mk_cont "parse_quoted" parse_quoted state \
680
860
parse_pos.Parse_pos.global_offset + pos - parse_pos.Parse_pos.buf_pos
683
({ Parse_pos.text_line = line; text_char = col } as parse_pos) pos =
863
({ Parse_pos.text_line = line; text_char = col; _ } as parse_pos) pos =
684
864
{ Annot.line; col; offset = get_glob_ofs parse_pos pos }
686
866
let mk_annot_pos1
687
({ Parse_pos.text_line = line; text_char = col } as parse_pos) pos =
867
({ Parse_pos.text_line = line; text_char = col; _ } as parse_pos) pos =
688
868
{ Annot.line; col = col + 1; offset = get_glob_ofs parse_pos pos }
690
let add_annot_pos { parse_pos; pstack } pos =
870
let add_annot_pos { parse_pos; pstack; pbuf = _ } pos =
691
871
pstack.Annot.positions <- mk_annot_pos parse_pos pos :: pstack.Annot.positions
693
let add_annot_pos1 { parse_pos; pstack } pos =
873
let add_annot_pos1 { parse_pos; pstack; pbuf = _ } pos =
694
874
pstack.Annot.positions <-
695
875
mk_annot_pos1 parse_pos pos :: pstack.Annot.positions
697
let get_annot_range { parse_pos; pstack } pos =
877
let get_annot_range { parse_pos; pstack; pbuf = _ } pos =
699
879
match pstack.Annot.positions with
700
880
| [] -> assert false (* impossible *)
777
957
let gen_input_rev_sexps my_parse ?parse_pos ?(buf = String.create 8192) ic =
778
958
let rev_sexps_ref = ref [] in
779
959
let buf_len = String.length buf in
780
let rec loop this_parse ~pos ~len ~is_incomplete =
960
let rec loop this_parse ~pos ~len ~cont_state =
782
962
match this_parse ~pos ~len buf with
783
| Done (sexp, ({ Parse_pos.buf_pos } as parse_pos)) ->
963
| Done (sexp, ({ Parse_pos.buf_pos; _ } as parse_pos)) ->
784
964
rev_sexps_ref := sexp :: !rev_sexps_ref;
785
965
let n_parsed = buf_pos - pos in
786
966
let this_parse = mk_this_parse ~parse_pos my_parse in
967
let cont_state = Cont_state.Parsing_whitespace in
787
968
if n_parsed = len then
788
969
let new_len = input ic buf 0 buf_len in
789
loop this_parse ~pos:0 ~len:new_len ~is_incomplete:false
792
~pos:buf_pos ~len:(len - n_parsed) ~is_incomplete:false
793
| Cont (ws_only, this_parse) ->
795
~pos:0 ~len:(input ic buf 0 buf_len) ~is_incomplete:(not ws_only)
796
else if is_incomplete then
798
"Sexplib.Sexp.input_rev_sexps: reached EOF with incomplete S-expression"
970
loop this_parse ~pos:0 ~len:new_len ~cont_state
971
else loop this_parse ~pos:buf_pos ~len:(len - n_parsed) ~cont_state
972
| Cont (cont_state, this_parse) ->
973
loop this_parse ~pos:0 ~len:(input ic buf 0 buf_len) ~cont_state
975
if cont_state = Cont_state.Parsing_whitespace then !rev_sexps_ref
978
"Sexplib.Sexp.input_rev_sexps: reached EOF while in state "
979
^ Cont_state.to_string cont_state)
801
981
let len = input ic buf 0 buf_len in
802
982
let this_parse = mk_this_parse ?parse_pos my_parse in
803
loop this_parse ~pos:0 ~len ~is_incomplete:false
983
loop this_parse ~pos:0 ~len ~cont_state:Cont_state.Parsing_whitespace
805
985
let input_rev_sexps ?parse_pos ?buf ic =
806
986
gen_input_rev_sexps parse ?parse_pos ?buf ic
825
1005
| Done (sexp, _) -> sexp
826
| Cont (ws_only, this_parse) ->
827
if ws_only then failwith (sprintf "Sexplib.Sexp.%s: whitespace only" loc);
1006
| Cont (_, this_parse) ->
828
1007
(* When parsing atoms, the incremental parser cannot tell whether
829
it is at the end until it hits whitespace. We therefore feed
830
it one space to determine whether it is finished. *)
1008
it is at the end until it hits whitespace. We therefore feed it
1009
one space to determine whether it is finished. *)
831
1010
match this_parse ~pos:0 ~len:1 ws_buf with
832
1011
| Done (sexp, _) -> sexp
1012
| Cont (cont_state, _) ->
1013
let cont_state_str = Cont_state.to_string cont_state in
835
sprintf "Sexplib.Sexp.%s: got incomplete S-expression: %s"
836
loc (get_sub str 0 (get_len str)))
1016
"Sexplib.Sexp.%s: incomplete S-expression while in state %s: %s"
1017
loc cont_state_str (get_sub str 0 (get_len str)))
838
1019
let of_string str =
839
1020
of_string_bigstring "of_string" parse " " String.length String.sub str
866
1047
let load_sexps ?buf file = List.rev (load_rev_sexps ?buf file)
1049
let gen_load_sexp_loc = "Sexplib.Sexp.gen_load_sexp"
868
1051
let gen_load_sexp my_parse ?(strict = true) ?(buf = String.create 8192) file =
869
1052
let buf_len = String.length buf in
870
1053
let ic = open_in file in
871
let rec loop this_parse =
1054
let rec loop this_parse ~cont_state =
872
1055
let len = input ic buf 0 buf_len in
874
failwith (sprintf "Sexplib.Sexp.gen_load_sexp: end of file: %s" file)
1058
sprintf "%s: EOF in %s while in state %s"
1059
gen_load_sexp_loc file (Cont_state.to_string cont_state))
876
1061
match this_parse ~pos:0 ~len buf with
877
| Done (sexp, ({ Parse_pos.buf_pos } as parse_pos))
1062
| Done (sexp, ({ Parse_pos.buf_pos; _ } as parse_pos)) when strict ->
879
1063
let rec strict_loop this_parse ~pos ~len =
880
1064
match this_parse ~pos ~len buf with
881
| Done _ | Cont (false, _) ->
884
"Sexplib.Sexp.gen_load_sexp: more than one S-expression: %s"
886
| Cont (true, this_parse) ->
1067
sprintf "%s: more than one S-expression in file %s"
1068
gen_load_sexp_loc file)
1069
| Cont (cont_state, this_parse) ->
887
1070
let len = input ic buf 0 buf_len in
889
else strict_loop this_parse ~pos:0 ~len
1071
if len > 0 then strict_loop this_parse ~pos:0 ~len
1072
else if cont_state = Cont_state.Parsing_whitespace then sexp
1075
sprintf "%s: %s in state %s loading file %s"
1076
gen_load_sexp_loc "additional incomplete data"
1077
(Cont_state.to_string cont_state) file)
891
1079
let this_parse = mk_this_parse ~parse_pos my_parse in
892
1080
strict_loop this_parse ~pos:buf_pos ~len:(len - buf_pos)
893
1081
| Done (sexp, _) -> sexp
894
| Cont (_, this_parse) -> loop this_parse
1082
| Cont (cont_state, this_parse) -> loop this_parse ~cont_state
897
let sexp = loop (mk_this_parse my_parse) in
1086
loop (mk_this_parse my_parse) ~cont_state:Cont_state.Parsing_whitespace
900
1090
with exc -> close_in_noerr ic; raise exc