%% @author Bob Ippolito <bob@mochimedia.com>
%% @copyright 2007 Mochi Media, Inc.

%% @doc Loosely tokenizes and generates parse trees for HTML 4.
-module(mochiweb_html).
-export([tokens/1, parse/1, parse_tokens/1, to_tokens/1, escape/1,
         escape_attr/1, to_html/1, test/0]).

%% This is a macro to placate syntax highlighters..
-define(QUOTE, $\").
-define(SQUOTE, $\').
-define(ADV_COL(S, N),
        S#decoder{column=N+S#decoder.column,
                  offset=N+S#decoder.offset}).
-define(INC_COL(S),
        S#decoder{column=1+S#decoder.column,
                  offset=1+S#decoder.offset}).
-define(INC_LINE(S),
        S#decoder{column=1,
                  line=1+S#decoder.line,
                  offset=1+S#decoder.offset}).
-define(INC_CHAR(S, C),
        case C of
            $\n ->
                S#decoder{column=1,
                          line=1+S#decoder.line,
                          offset=1+S#decoder.offset};
            _ ->
                S#decoder{column=1+S#decoder.column,
                          offset=1+S#decoder.offset}
        end).
-define(IS_WHITESPACE(C),
        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
-define(IS_LITERAL_SAFE(C),
        ((C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z)
         orelse (C >= $0 andalso C =< $9))).

-record(decoder, {line=1,
                  column=1,
                  offset=0}).

%% @type html_node() = {string(), [html_attr()], [html_node() | string()]}
%% @type html_attr() = {string(), string()}
%% @type html_token() = html_data() | start_tag() | end_tag() | inline_html() | html_comment() | html_doctype()
%% @type html_data() = {data, string(), Whitespace::boolean()}
%% @type start_tag() = {start_tag, Name, [html_attr()], Singleton::boolean()}
%% @type end_tag() = {end_tag, Name}
%% @type html_comment() = {comment, Comment}
%% @type html_doctype() = {doctype, [Doctype]}
%% @type inline_html() = {'=', iolist()}
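%%
%% Illustrative example of these shapes (see the tests below for
%% authoritative cases): the fragment `<p class="x">hi</p>' tokenizes to
%%   [{start_tag, <<"p">>, [{<<"class">>, <<"x">>}], false},
%%    {data, <<"hi">>, false},
%%    {end_tag, <<"p">>}]
%% and corresponds to the html_node()
%%   {<<"p">>, [{<<"class">>, <<"x">>}], [<<"hi">>]}
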
%% @spec parse(string() | binary()) -> html_node()
%% @doc Tokenize and then transform the token stream into an HTML tree.
parse(Input) ->
    parse_tokens(tokens(Input)).

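%% Example (illustrative; see test_parse/0 and test_parse_tokens/0 below):
%%   1> mochiweb_html:parse(<<"<html><body>hi</body></html>">>).
%%   {<<"html">>, [], [{<<"body">>, [], [<<"hi">>]}]}
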
%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into an HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
    %% Skip over doctype, processing instructions
    F = fun (X) ->
                case X of
                    {start_tag, _, _, false} ->
                        false;
                    _ ->
                        true
                end
        end,
    [{start_tag, Tag, Attrs, false} | Rest] = lists:dropwhile(F, Tokens),
    {Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
    Tree.

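%% Example (illustrative; compare test_parse_tokens/0 below):
%%   1> mochiweb_html:parse_tokens([{start_tag, <<"html">>, [], false},
%%                                  {start_tag, <<"body">>, [], false},
%%                                  {end_tag, <<"body">>},
%%                                  {end_tag, <<"html">>}]).
%%   {<<"html">>, [], [{<<"body">>, [], []}]}
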
%% @spec tokens(StringOrBinary) -> [html_token()]
%% @doc Transform the input UTF-8 HTML into a token stream.
tokens(Input) ->
    tokens(iolist_to_binary(Input), #decoder{}, []).

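%% Example (from test_tokens/0 below; attribute values may be bare,
%% single-quoted, or double-quoted):
%%   1> mochiweb_html:tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>).
%%   [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
%%                            {<<"wibble">>, <<"wibble">>},
%%                            {<<"alice">>, <<"bob">>}], true}]
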
%% @spec to_tokens(html_node()) -> [html_token()]
%% @doc Convert an html_node() tree to a list of tokens.
to_tokens({Tag0}) ->
    to_tokens({Tag0, [], []});
to_tokens(T={'=', _}) ->
    [T];
to_tokens(T={doctype, _}) ->
    [T];
to_tokens(T={comment, _}) ->
    [T];
to_tokens({Tag0, Acc}) ->
    to_tokens({Tag0, [], Acc});
to_tokens({Tag0, Attrs, Acc}) ->
    Tag = to_tag(Tag0),
    to_tokens([{Tag, Acc}], [{start_tag, Tag, Attrs, is_singleton(Tag)}]).

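%% Example (illustrative; atoms are normalized to lowercase binary tags):
%%   1> mochiweb_html:to_tokens({'div', [<<"sucka">>]}).
%%   [{start_tag, <<"div">>, [], false},
%%    {data, <<"sucka">>, false},
%%    {end_tag, <<"div">>}]
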
%% @spec to_html([html_token()] | html_node()) -> iolist()
%% @doc Convert a list of html_token() to an HTML document.
to_html(Node) when is_tuple(Node) ->
    to_html(to_tokens(Node));
to_html(Tokens) when is_list(Tokens) ->
    to_html(Tokens, []).

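%% Example (illustrative; wrap with iolist_to_binary/1 for a flat result):
%%   1> iolist_to_binary(mochiweb_html:to_html({p, [{class, foo}], [<<"hi">>]})).
%%   <<"<p class=\"foo\">hi</p>">>
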
%% @spec escape(string() | atom() | binary()) -> binary()
%% @doc Escape a string such that it's safe for HTML (amp; lt; gt;).
escape(B) when is_binary(B) ->
    escape(binary_to_list(B), []);
escape(A) when is_atom(A) ->
    escape(atom_to_list(A), []);
escape(S) when is_list(S) ->
    escape(S, []).

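%% Example (illustrative):
%%   1> mochiweb_html:escape("1 < 2 & 3").
%%   <<"1 &lt; 2 &amp; 3">>
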
%% @spec escape_attr(string() | binary() | atom() | integer() | float()) -> binary()
%% @doc Escape a string such that it's safe for HTML attrs
%%      (amp; lt; gt; quot;).
escape_attr(B) when is_binary(B) ->
    escape_attr(binary_to_list(B), []);
escape_attr(A) when is_atom(A) ->
    escape_attr(atom_to_list(A), []);
escape_attr(S) when is_list(S) ->
    escape_attr(S, []);
escape_attr(I) when is_integer(I) ->
    escape_attr(integer_to_list(I), []);
escape_attr(F) when is_float(F) ->
    escape_attr(mochinum:digits(F), []).

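%% Example (illustrative; quotes are escaped in addition to amp/lt/gt):
%%   1> mochiweb_html:escape_attr("say \"hi\"").
%%   <<"say &quot;hi&quot;">>
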
%% @spec test() -> ok
%% @doc Run tests for mochiweb_html.
test() ->
    test_destack(),
    test_tokens(),
    test_parse(),
    test_parse_tokens(),
    test_escape(),
    test_escape_attr(),
    test_to_html(),
    ok.

test_to_html() ->
    Expect = <<"<html><head><title>hey!</title></head><body><p class=\"foo\">what's up<br /></p><div>sucka</div><!-- comment! --></body></html>">>,
    Expect = iolist_to_binary(
               to_html({html, [],
                        [{<<"head">>, [],
                          [{title, <<"hey!">>}]},
                         {body, [],
                          [{p, [{class, foo}], [<<"what's">>, <<" up">>, {br}]},
                           {'div', <<"sucka">>},
                           {comment, <<" comment! ">>}]}]})),
    Expect1 = <<"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">">>,
    Expect1 = iolist_to_binary(
                to_html({doctype,
                         [<<"html">>, <<"PUBLIC">>,
                          <<"-//W3C//DTD XHTML 1.0 Transitional//EN">>,
                          <<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">>]})),
    ok.

to_html([], Acc) ->
    lists:reverse(Acc);
to_html([{'=', Content} | Rest], Acc) ->
    to_html(Rest, [Content | Acc]);
to_html([{pi, Tag, Attrs} | Rest], Acc) ->
    Open = [<<"<?">>,
            Tag,
            attrs_to_html(Attrs, []),
            <<"?>">>],
    to_html(Rest, [Open | Acc]);
to_html([{comment, Comment} | Rest], Acc) ->
    to_html(Rest, [[<<"<!--">>, Comment, <<"-->">>] | Acc]);
to_html([{doctype, Parts} | Rest], Acc) ->
    Inside = doctype_to_html(Parts, []),
    to_html(Rest, [[<<"<!DOCTYPE">>, Inside, <<">">>] | Acc]);
to_html([{data, Data, _Whitespace} | Rest], Acc) ->
    to_html(Rest, [escape(Data) | Acc]);
to_html([{start_tag, Tag, Attrs, Singleton} | Rest], Acc) ->
    Open = [<<"<">>,
            Tag,
            attrs_to_html(Attrs, []),
            case Singleton of
                true -> <<" />">>;
                false -> <<">">>
            end],
    to_html(Rest, [Open | Acc]);
to_html([{end_tag, Tag} | Rest], Acc) ->
    to_html(Rest, [[<<"</">>, Tag, <<">">>] | Acc]).

doctype_to_html([], Acc) ->
    lists:reverse(Acc);
doctype_to_html([Word | Rest], Acc) ->
    case lists:all(fun (C) -> ?IS_LITERAL_SAFE(C) end,
                   binary_to_list(iolist_to_binary(Word))) of
        true ->
            doctype_to_html(Rest, [[<<" ">>, Word] | Acc]);
        false ->
            doctype_to_html(Rest, [[<<" \"">>, escape_attr(Word), ?QUOTE] | Acc])
    end.

attrs_to_html([], Acc) ->
    lists:reverse(Acc);
attrs_to_html([{K, V} | Rest], Acc) ->
    attrs_to_html(Rest,
                  [[<<" ">>, escape(K), <<"=\"">>,
                    escape_attr(V), <<"\"">>] | Acc]).

test_escape() ->
    <<"&amp;quot;\"word &lt;&lt;up!&amp;quot;">> =
        escape(<<"&quot;\"word <<up!&quot;">>),
    ok.

test_escape_attr() ->
    <<"&amp;quot;&quot;word &lt;&lt;up!&amp;quot;">> =
        escape_attr(<<"&quot;\"word <<up!&quot;">>),
    ok.

escape([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape("<" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&lt;", Acc));
escape(">" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&gt;", Acc));
escape("&" ++ Rest, Acc) ->
    escape(Rest, lists:reverse("&amp;", Acc));
escape([C | Rest], Acc) ->
    escape(Rest, [C | Acc]).

escape_attr([], Acc) ->
    list_to_binary(lists:reverse(Acc));
escape_attr("<" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&lt;", Acc));
escape_attr(">" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&gt;", Acc));
escape_attr("&" ++ Rest, Acc) ->
    escape_attr(Rest, lists:reverse("&amp;", Acc));
escape_attr([?QUOTE | Rest], Acc) ->
    escape_attr(Rest, lists:reverse("&quot;", Acc));
escape_attr([C | Rest], Acc) ->
    escape_attr(Rest, [C | Acc]).

to_tag(A) when is_atom(A) ->
    norm(atom_to_list(A));
to_tag(L) ->
    norm(L).

to_tokens([], Acc) ->
    lists:reverse(Acc);
to_tokens([{Tag, []} | Rest], Acc) ->
    to_tokens(Rest, [{end_tag, to_tag(Tag)} | Acc]);
to_tokens([{Tag0, [{T0} | R1]} | Rest], Acc) ->
    %% Allow {br}
    to_tokens([{Tag0, [{T0, [], []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [T0={'=', _C0} | R1]} | Rest], Acc) ->
    %% Allow {'=', iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [T0={comment, _C0} | R1]} | Rest], Acc) ->
    %% Allow {comment, iolist()}
    to_tokens([{Tag0, R1} | Rest], [T0 | Acc]);
to_tokens([{Tag0, [{T0, A0=[{_, _} | _]} | R1]} | Rest], Acc) ->
    %% Allow {p, [{"class", "foo"}]}
    to_tokens([{Tag0, [{T0, A0, []} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, C0} | R1]} | Rest], Acc) ->
    %% Allow {p, "content"} and {p, <<"content">>}
    to_tokens([{Tag0, [{T0, [], C0} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0} | R1]} | Rest], Acc) when is_binary(C0) ->
    %% Allow {"p", [{"class", "foo"}], <<"content">>}
    to_tokens([{Tag0, [{T0, A1, binary_to_list(C0)} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C0=[C | _]} | R1]} | Rest], Acc)
  when is_integer(C) ->
    %% Allow {"p", [{"class", "foo"}], "content"}
    to_tokens([{Tag0, [{T0, A1, [C0]} | R1]} | Rest], Acc);
to_tokens([{Tag0, [{T0, A1, C1} | R1]} | Rest], Acc) ->
    %% Native {"p", [{"class", "foo"}], ["content"]}
    Tag = to_tag(Tag0),
    T1 = to_tag(T0),
    case is_singleton(norm(T1)) of
        true ->
            to_tokens([{Tag, R1} | Rest], [{start_tag, T1, A1, true} | Acc]);
        false ->
            to_tokens([{T1, C1}, {Tag, R1} | Rest],
                      [{start_tag, T1, A1, false} | Acc])
    end;
to_tokens([{Tag0, [L | R1]} | Rest], Acc) when is_list(L) ->
    %% Bare iolist text content
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, iolist_to_binary(L), false} | Acc]);
to_tokens([{Tag0, [B | R1]} | Rest], Acc) when is_binary(B) ->
    %% Bare binary text content
    Tag = to_tag(Tag0),
    to_tokens([{Tag, R1} | Rest], [{data, B, false} | Acc]).

test_tokens() ->
    [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
                             {<<"wibble">>, <<"wibble">>},
                             {<<"alice">>, <<"bob">>}], true}] =
        tokens(<<"<foo bar=baz wibble='wibble' alice=\"bob\"/>">>),
    [{start_tag, <<"foo">>, [{<<"bar">>, <<"baz">>},
                             {<<"wibble">>, <<"wibble">>},
                             {<<"alice">>, <<"bob">>}], true}] =
        tokens(<<"<foo bar=baz wibble='wibble' alice=bob/>">>),
    [{comment, <<"[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]">>}] =
        tokens(<<"<!--[if lt IE 7]>\n<style type=\"text/css\">\n.no_ie { display: none; }\n</style>\n<![endif]-->">>),
    ok.

tokens(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            lists:reverse(Acc);
        _ ->
            {Tag, S1} = tokenize(B, S),
            tokens(B, S1, [Tag | Acc])
    end.

tokenize(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, "<!--", _/binary>> ->
            tokenize_comment(B, ?ADV_COL(S, 4));
        <<_:O/binary, "<!DOCTYPE", _/binary>> ->
            tokenize_doctype(B, ?ADV_COL(S, 10));
        <<_:O/binary, "<![CDATA[", _/binary>> ->
            tokenize_cdata(B, ?ADV_COL(S, 9));
        <<_:O/binary, "<?", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            S3 = find_qgt(B, S2),
            {{pi, Tag, Attrs}, S3};
        <<_:O/binary, "&", _/binary>> ->
            tokenize_charref(B, ?INC_COL(S));
        <<_:O/binary, "</", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?ADV_COL(S, 2)),
            {S2, _} = find_gt(B, S1),
            {{end_tag, Tag}, S2};
        <<_:O/binary, "<", C, _/binary>> when ?IS_WHITESPACE(C) ->
            %% This isn't really strict HTML but we want this for markdown
            tokenize_data(B, ?INC_COL(S));
        <<_:O/binary, "<", _/binary>> ->
            {Tag, S1} = tokenize_literal(B, ?INC_COL(S)),
            {Attrs, S2} = tokenize_attributes(B, S1),
            {S3, HasSlash} = find_gt(B, S2),
            Singleton = HasSlash orelse is_singleton(norm(binary_to_list(Tag))),
            {{start_tag, Tag, Attrs, Singleton}, S3};
        _ ->
            tokenize_data(B, S)
    end.

test_parse() ->
    D0 = <<"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">
<html>
 <head>
   <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
   <title>Foo</title>
   <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/rel/dojo/resources/dojo.css\" media=\"screen\">
   <link rel=\"stylesheet\" type=\"text/css\" href=\"/static/foo.css\" media=\"screen\">
   <!--[if lt IE 7]>
   <style type=\"text/css\">
     .no_ie { display: none; }
   </style>
   <![endif]-->
   <link rel=\"icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
   <link rel=\"shortcut icon\" href=\"/static/images/favicon.ico\" type=\"image/x-icon\">
 </head>
 <body id=\"home\" class=\"tundra\"><![CDATA[<<this<!-- is -->CDATA>>]]></body>
</html>">>,
    Expect = {<<"html">>, [],
              [{<<"head">>, [],
                [{<<"meta">>,
                  [{<<"http-equiv">>,<<"Content-Type">>},
                   {<<"content">>,<<"text/html; charset=UTF-8">>}],
                  []},
                 {<<"title">>,[],[<<"Foo">>]},
                 {<<"link">>,
                  [{<<"rel">>,<<"stylesheet">>},
                   {<<"type">>,<<"text/css">>},
                   {<<"href">>,<<"/static/rel/dojo/resources/dojo.css">>},
                   {<<"media">>,<<"screen">>}],
                  []},
                 {<<"link">>,
                  [{<<"rel">>,<<"stylesheet">>},
                   {<<"type">>,<<"text/css">>},
                   {<<"href">>,<<"/static/foo.css">>},
                   {<<"media">>,<<"screen">>}],
                  []},
                 {comment,<<"[if lt IE 7]>\n   <style type=\"text/css\">\n     .no_ie { display: none; }\n   </style>\n   <![endif]">>},
                 {<<"link">>,
                  [{<<"rel">>,<<"icon">>},
                   {<<"href">>,<<"/static/images/favicon.ico">>},
                   {<<"type">>,<<"image/x-icon">>}],
                  []},
                 {<<"link">>,
                  [{<<"rel">>,<<"shortcut icon">>},
                   {<<"href">>,<<"/static/images/favicon.ico">>},
                   {<<"type">>,<<"image/x-icon">>}],
                  []}]},
               {<<"body">>,
                [{<<"id">>,<<"home">>},
                 {<<"class">>,<<"tundra">>}],
                [<<"<<this<!-- is -->CDATA>>">>]}]},
    Expect = parse(D0),
    ok.

test_parse_tokens() ->
    D0 = [{doctype,[<<"HTML">>,<<"PUBLIC">>,<<"-//W3C//DTD HTML 4.01 Transitional//EN">>]},
          {data,<<"\n">>,true},
          {start_tag,<<"html">>,[],false}],
    {<<"html">>, [], []} = parse_tokens(D0),
    D1 = D0 ++ [{end_tag, <<"html">>}],
    {<<"html">>, [], []} = parse_tokens(D1),
    D2 = D0 ++ [{start_tag, <<"body">>, [], false}],
    {<<"html">>, [], [{<<"body">>, [], []}]} = parse_tokens(D2),
    D3 = D0 ++ [{start_tag, <<"head">>, [], false},
                {end_tag, <<"head">>},
                {start_tag, <<"body">>, [], false}],
    {<<"html">>, [], [{<<"head">>, [], []}, {<<"body">>, [], []}]} = parse_tokens(D3),
    D4 = D3 ++ [{data,<<"\n">>,true},
                {start_tag,<<"div">>,[{<<"class">>,<<"a">>}],false},
                {start_tag,<<"a">>,[{<<"name">>,<<"#anchor">>}],false},
                {end_tag,<<"a">>},
                {end_tag,<<"div">>},
                {start_tag,<<"div">>,[{<<"class">>,<<"b">>}],false},
                {start_tag,<<"div">>,[{<<"class">>,<<"c">>}],false},
                {end_tag,<<"div">>},
                {end_tag,<<"div">>}],
    {<<"html">>, [],
     [{<<"head">>, [], []},
      {<<"body">>, [],
       [{<<"div">>, [{<<"class">>, <<"a">>}], [{<<"a">>, [{<<"name">>, <<"#anchor">>}], []}]},
        {<<"div">>, [{<<"class">>, <<"b">>}], [{<<"div">>, [{<<"class">>, <<"c">>}], []}]}
       ]}]} = parse_tokens(D4),
    D5 = [{start_tag,<<"html">>,[],false},
          {data,<<"\n">>,true},
          {data,<<"boo">>,false},
          {data,<<"hoo">>,false},
          {data,<<"\n">>,true},
          {end_tag,<<"html">>}],
    {<<"html">>, [], [<<"\nboohoo\n">>]} = parse_tokens(D5),
    D6 = [{start_tag,<<"html">>,[],false},
          {data,<<"\n">>,true},
          {data,<<"\n">>,true},
          {end_tag,<<"html">>}],
    {<<"html">>, [], []} = parse_tokens(D6),
    D7 = [{start_tag,<<"html">>,[],false},
          {start_tag,<<"ul">>,[],false},
          {start_tag,<<"li">>,[],false},
          {data,<<"word">>,false},
          {start_tag,<<"li">>,[],false},
          {data,<<"up">>,false},
          {end_tag,<<"li">>},
          {start_tag,<<"li">>,[],false},
          {data,<<"fdsa">>,false},
          {start_tag,<<"br">>,[],true},
          {data,<<"asdf">>,false},
          {end_tag,<<"ul">>},
          {end_tag,<<"html">>}],
    {<<"html">>, [],
     [{<<"ul">>, [],
       [{<<"li">>, [], [<<"word">>]},
        {<<"li">>, [], [<<"up">>]},
        {<<"li">>, [], [<<"fdsa">>,{<<"br">>, [], []}, <<"asdf">>]}]}]} = parse_tokens(D7),
    ok.

tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
    tree_data(Rest, (Whitespace andalso AllWhitespace), [Data | Acc]);
tree_data(Rest, AllWhitespace, Acc) ->
    {iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.

tree([], Stack) ->
    {destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
    case destack(norm(Tag), Stack) of
        S when is_list(S) ->
            tree(Rest, S);
        Result ->
            {Result, Rest}
    end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
    tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
    tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T={pi, _Tag, _Attrs} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree([T={comment, _Comment} | Rest], S) ->
    tree(Rest, append_stack_child(T, S));
tree(L=[{data, _Data, _Whitespace} | _], S) ->
    case tree_data(L, true, []) of
        {_, true, Rest} ->
            tree(Rest, S);
        {Data, false, Rest} ->
            tree(Rest, append_stack_child(Data, S))
    end.

norm({Tag, Attrs}) ->
    {norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
    Tag;
norm(Tag) ->
    list_to_binary(string:to_lower(Tag)).

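%% For example, norm("DIV") and norm(<<"div">>) both yield <<"div">>;
%% note that binaries are passed through unchanged, so only list tags
%% are lowercased.
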
test_destack() ->
    {<<"a">>, [], []} =
        destack([{<<"a">>, [], []}]),
    {<<"a">>, [], [{<<"b">>, [], []}]} =
        destack([{<<"b">>, [], []}, {<<"a">>, [], []}]),
    {<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]} =
        destack([{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
    [{<<"a">>, [], [{<<"b">>, [], [{<<"c">>, [], []}]}]}] =
        destack(<<"b">>,
                [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
    [{<<"b">>, [], [{<<"c">>, [], []}]}, {<<"a">>, [], []}] =
        destack(<<"c">>,
                [{<<"c">>, [], []}, {<<"b">>, [], []}, {<<"a">>, [], []}]),
    ok.

stack(T1={TN, _, _}, Stack=[{TN, _, _} | _Rest])
  when TN =:= <<"li">> orelse TN =:= <<"option">> ->
    [T1 | destack(TN, Stack)];
stack(T1={TN0, _, _}, Stack=[{TN1, _, _} | _Rest])
  when (TN0 =:= <<"dd">> orelse TN0 =:= <<"dt">>) andalso
       (TN1 =:= <<"dd">> orelse TN1 =:= <<"dt">>) ->
    [T1 | destack(TN1, Stack)];
stack(T1, Stack) ->
    [T1 | Stack].

append_stack_child(StartTag, [{Name, Attrs, Acc} | Stack]) ->
    [{Name, Attrs, [StartTag | Acc]} | Stack].

destack(TagName, Stack) when is_list(Stack) ->
    F = fun (X) ->
                case X of
                    {TagName, _, _} ->
                        false;
                    _ ->
                        true
                end
        end,
    case lists:splitwith(F, Stack) of
        {_, []} ->
            %% No match, no state change
            Stack;
        {_Pre, [_T]} ->
            %% Unfurl the whole stack, we're done
            destack(Stack);
        {Pre, [T, {T0, A0, Acc0} | Post]} ->
            %% Unfurl up to the tag, then accumulate it
            [{T0, A0, [destack(Pre ++ [T]) | Acc0]} | Post]
    end.

destack([{Tag, Attrs, Acc}]) ->
    {Tag, Attrs, lists:reverse(Acc)};
destack([{T1, A1, Acc1}, {T0, A0, Acc0} | Rest]) ->
    destack([{T0, A0, [{T1, A1, lists:reverse(Acc1)} | Acc0]} | Rest]).

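%% HTML 4 elements that may not have content; to_html/1 renders these
%% as self-closing (e.g. <br />) and tokens/1 marks them as singletons
%% even without a trailing slash.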
is_singleton(<<"br">>) -> true;
is_singleton(<<"hr">>) -> true;
is_singleton(<<"img">>) -> true;
is_singleton(<<"input">>) -> true;
is_singleton(<<"base">>) -> true;
is_singleton(<<"meta">>) -> true;
is_singleton(<<"link">>) -> true;
is_singleton(<<"area">>) -> true;
is_singleton(<<"param">>) -> true;
is_singleton(<<"col">>) -> true;
is_singleton(_) -> false.

tokenize_data(B, S=#decoder{offset=O}) ->
    tokenize_data(B, S, O, true).

tokenize_data(B, S=#decoder{offset=O}, Start, Whitespace) ->
    case B of
        <<_:O/binary, C, _/binary>> when (C =/= $< andalso C =/= $&) ->
            tokenize_data(B, ?INC_CHAR(S, C), Start,
                          (Whitespace andalso ?IS_WHITESPACE(C)));
        _ ->
            Len = O - Start,
            <<_:Start/binary, Data:Len/binary, _/binary>> = B,
            {{data, Data, Whitespace}, S}
    end.

tokenize_attributes(B, S) ->
    tokenize_attributes(B, S, []).

tokenize_attributes(B, S=#decoder{offset=O}, Acc) ->
    case B of
        <<_:O/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when (C =:= $> orelse C =:= $/) ->
            {lists:reverse(Acc), S};
        <<_:O/binary, "?>", _/binary>> ->
            {lists:reverse(Acc), S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_attributes(B, ?INC_CHAR(S, C), Acc);
        _ ->
            {Attr, S1} = tokenize_literal(B, S),
            {Value, S2} = tokenize_attr_value(Attr, B, S1),
            tokenize_attributes(B, S2, [{Attr, Value} | Acc])
    end.

tokenize_attr_value(Attr, B, S) ->
    S1 = skip_whitespace(B, S),
    O = S1#decoder.offset,
    case B of
        <<_:O/binary, "=", _/binary>> ->
            tokenize_word_or_literal(B, ?INC_COL(S1));
        _ ->
            {Attr, S1}
    end.

skip_whitespace(B, S=#decoder{offset=O}) ->
    case B of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            skip_whitespace(B, ?INC_CHAR(S, C));
        _ ->
            S
    end.

tokenize_literal(Bin, S) ->
    tokenize_literal(Bin, S, []).

tokenize_literal(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_literal(Bin, S1, [Data | Acc]);
        <<_:O/binary, C, _/binary>> when not (?IS_WHITESPACE(C)
                                              orelse C =:= $>
                                              orelse C =:= $/
                                              orelse C =:= $=) ->
            tokenize_literal(Bin, ?INC_COL(S), [C | Acc]);
        _ ->
            {iolist_to_binary(lists:reverse(Acc)), S}
    end.

find_qgt(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, "?>", _/binary>> ->
            ?ADV_COL(S, 2);
        <<_:O/binary, C, _/binary>> ->
            find_qgt(Bin, ?INC_CHAR(S, C));
        _ ->
            S
    end.

find_gt(Bin, S) ->
    find_gt(Bin, S, false).

find_gt(Bin, S=#decoder{offset=O}, HasSlash) ->
    case Bin of
        <<_:O/binary, $/, _/binary>> ->
            find_gt(Bin, ?INC_COL(S), true);
        <<_:O/binary, $>, _/binary>> ->
            {?INC_COL(S), HasSlash};
        <<_:O/binary, C, _/binary>> ->
            find_gt(Bin, ?INC_CHAR(S, C), HasSlash);
        _ ->
            {S, HasSlash}
    end.

tokenize_charref(Bin, S=#decoder{offset=O}) ->
    tokenize_charref(Bin, S, O).

tokenize_charref(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary>> ->
            <<_:Start/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C)
                                         orelse C =:= ?SQUOTE
                                         orelse C =:= ?QUOTE
                                         orelse C =:= $<
                                         orelse C =:= $& ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, S};
        <<_:O/binary, $;, _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            Data = case mochiweb_charref:charref(Raw) of
                       undefined ->
                           Start1 = Start - 1,
                           Len1 = Len + 2,
                           <<_:Start1/binary, R:Len1/binary, _/binary>> = Bin,
                           R;
                       Unichar ->
                           list_to_binary(xmerl_ucs:to_utf8(Unichar))
                   end,
            {{data, Data, false}, ?INC_COL(S)};
        _ ->
            tokenize_charref(Bin, ?INC_COL(S), Start)
    end.

tokenize_doctype(Bin, S) ->
    tokenize_doctype(Bin, S, []).

tokenize_doctype(Bin, S=#decoder{offset=O}, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {{doctype, lists:reverse(Acc)}, S};
        <<_:O/binary, $>, _/binary>> ->
            {{doctype, lists:reverse(Acc)}, ?INC_COL(S)};
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            tokenize_doctype(Bin, ?INC_CHAR(S, C), Acc);
        _ ->
            {Word, S1} = tokenize_word_or_literal(Bin, S),
            tokenize_doctype(Bin, S1, [Word | Acc])
    end.

tokenize_word_or_literal(Bin, S=#decoder{offset=O}) ->
    case Bin of
        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
            {error, {whitespace, [C], S}};
        <<_:O/binary, C, _/binary>> when C =:= ?QUOTE orelse C =:= ?SQUOTE ->
            tokenize_word(Bin, ?INC_COL(S), C);
        _ ->
            tokenize_literal(Bin, S, [])
    end.

tokenize_word(Bin, S, Quote) ->
    tokenize_word(Bin, S, Quote, []).

tokenize_word(Bin, S=#decoder{offset=O}, Quote, Acc) ->
    case Bin of
        <<_:O/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), S};
        <<_:O/binary, Quote, _/binary>> ->
            {iolist_to_binary(lists:reverse(Acc)), ?INC_COL(S)};
        <<_:O/binary, $&, _/binary>> ->
            {{data, Data, false}, S1} = tokenize_charref(Bin, ?INC_COL(S)),
            tokenize_word(Bin, S1, Quote, [Data | Acc]);
        <<_:O/binary, C, _/binary>> ->
            tokenize_word(Bin, ?INC_CHAR(S, C), Quote, [C | Acc])
    end.

tokenize_cdata(Bin, S=#decoder{offset=O}) ->
    tokenize_cdata(Bin, S, O).

tokenize_cdata(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "]]>", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{data, Raw, false}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_cdata(Bin, ?INC_CHAR(S, C), Start);
        _ ->
            <<_:O/binary, Raw/binary>> = Bin,
            {{data, Raw, false}, S}
    end.

tokenize_comment(Bin, S=#decoder{offset=O}) ->
    tokenize_comment(Bin, S, O).

tokenize_comment(Bin, S=#decoder{offset=O}, Start) ->
    case Bin of
        <<_:O/binary, "-->", _/binary>> ->
            Len = O - Start,
            <<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,
            {{comment, Raw}, ?ADV_COL(S, 3)};
        <<_:O/binary, C, _/binary>> ->
            tokenize_comment(Bin, ?INC_CHAR(S, C), Start);
        <<_:Start/binary, Raw/binary>> ->
            {{comment, Raw}, S}
    end.