1
/* tags.c -- recognize HTML tags
3
(c) 1998-2001 (W3C) MIT, INRIA, Keio University
4
See tidy.c for the copyright notice.
7
The HTML tags are stored as 8 bit ASCII strings.
8
Use lookupw() to find a tag given a wide char string.
12
$Author: terry_teague $
13
$Date: 2001/09/01 04:15:40 $
19
#include "platform.h" /* platform independent stuff */
20
#include "html.h" /* to pull in definition of nodes */
34
Dict *tag_iframe; /* #433359 - fix by Randy Waki 12 Mar 01 */
89
Dict *xml_tags; /* dummy for xml tags */
91
/* chained hash table of tag dictionary entries; buckets are selected by hash() */
static Dict *hashtab[HASHSIZE];
93
/* used by FindFirstDefinedTag and FindNextDefinedTag */
94
static Dict *tag_blink; /* a proprietary tag added by Tidy, along with tag_nobr, tag_wbr */
95
/* cursor for the defined-tag search: the entry currently being examined in a bucket chain */
static Dict *curDictEntry;
96
/* cursor for the defined-tag search: index of the next hashtab bucket to visit */
static int curHashIndex;
104
CheckAttribs *chkattrs;
107
/*
 One initializer per known tag. The columns are, in order:
 tag name, VERS_* bit mask, CM_* content model flags, parser routine,
 and attribute checking routine (null when the row has none).
 NOTE(review): presumably this is the `tags` array that the install()
 loop walks at start-up; the declaration line is not visible here - confirm.
*/
{"html", VERS_ALL, (CM_HTML|CM_OPT|CM_OMITST), ParseHTML, CheckHTML},
109
{"head", VERS_ALL, (CM_HTML|CM_OPT|CM_OMITST), ParseHead, null},
111
{"title", VERS_ALL, CM_HEAD, ParseTitle, null},
112
{"base", VERS_ALL, (CM_HEAD|CM_EMPTY), ParseEmpty, null},
113
{"link", VERS_ALL, (CM_HEAD|CM_EMPTY), ParseEmpty, CheckLINK},
114
{"meta", VERS_ALL, (CM_HEAD|CM_EMPTY), ParseEmpty, CheckMETA},
115
{"style", (VERS_FROM32)&~VERS_BASIC, CM_HEAD, ParseScript, CheckSTYLE},
116
{"script", (VERS_FROM32)&~VERS_BASIC, (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE), ParseScript, CheckSCRIPT},
117
{"server", VERS_NETSCAPE, (CM_HEAD|CM_MIXED|CM_BLOCK|CM_INLINE), ParseScript, null},
119
{"body", VERS_ALL, (CM_HTML|CM_OPT|CM_OMITST), ParseBody, null},
120
{"frameset", VERS_FRAMESET, (CM_HTML|CM_FRAMES), ParseFrameSet, null},
122
{"p", VERS_ALL, (CM_BLOCK|CM_OPT), ParseInline, null},
123
{"h1", VERS_ALL, (CM_BLOCK|CM_HEADING), ParseInline, null},
124
{"h2", VERS_ALL, (CM_BLOCK|CM_HEADING), ParseInline, null},
125
{"h3", VERS_ALL, (CM_BLOCK|CM_HEADING), ParseInline, null},
126
{"h4", VERS_ALL, (CM_BLOCK|CM_HEADING), ParseInline, null},
127
{"h5", VERS_ALL, (CM_BLOCK|CM_HEADING), ParseInline, null},
128
{"h6", VERS_ALL, (CM_BLOCK|CM_HEADING), ParseInline, null},
129
{"ul", VERS_ALL, CM_BLOCK, ParseList, null},
130
{"ol", VERS_ALL, CM_BLOCK, ParseList, null},
131
{"dl", VERS_ALL, CM_BLOCK, ParseDefList, null},
132
{"dir", VERS_LOOSE, (CM_BLOCK|CM_OBSOLETE), ParseList, null},
133
{"menu", VERS_LOOSE, (CM_BLOCK|CM_OBSOLETE), ParseList, null},
134
{"pre", VERS_ALL, CM_BLOCK, ParsePre, null},
135
{"listing", VERS_ALL, (CM_BLOCK|CM_OBSOLETE), ParsePre, null},
136
{"xmp", VERS_ALL, (CM_BLOCK|CM_OBSOLETE), ParsePre, null},
137
{"plaintext", VERS_ALL, (CM_BLOCK|CM_OBSOLETE), ParsePre, null},
138
{"address", VERS_ALL, CM_BLOCK, ParseBlock, null},
139
{"blockquote", VERS_ALL, CM_BLOCK, ParseBlock, null},
140
{"form", VERS_ALL, CM_BLOCK, ParseBlock, CheckFORM},
141
{"isindex", VERS_LOOSE, (CM_BLOCK|CM_EMPTY), ParseEmpty, null},
142
{"fieldset", (VERS_HTML40)&~VERS_BASIC, CM_BLOCK, ParseBlock, null},
143
{"table", VERS_FROM32, CM_BLOCK, ParseTableTag, CheckTABLE},
144
{"hr", (VERS_ALL)&~VERS_BASIC, (CM_BLOCK|CM_EMPTY), ParseEmpty, CheckHR},
145
{"div", VERS_FROM32, CM_BLOCK, ParseBlock, null},
146
{"multicol", VERS_NETSCAPE, CM_BLOCK, ParseBlock, null},
147
{"nosave", VERS_NETSCAPE, CM_BLOCK, ParseBlock, null},
148
{"layer", VERS_NETSCAPE, CM_BLOCK, ParseBlock, null},
149
{"ilayer", VERS_NETSCAPE, CM_INLINE, ParseInline, null},
150
{"nolayer", VERS_NETSCAPE, (CM_BLOCK|CM_INLINE|CM_MIXED), ParseBlock, null},
151
{"align", VERS_NETSCAPE, CM_BLOCK, ParseBlock, null},
152
{"center", VERS_LOOSE, CM_BLOCK, ParseBlock, null},
153
{"ins", (VERS_HTML40)&~VERS_BASIC, (CM_INLINE|CM_BLOCK|CM_MIXED), ParseInline, null},
154
{"del", (VERS_HTML40)&~VERS_BASIC, (CM_INLINE|CM_BLOCK|CM_MIXED), ParseInline, null},
156
{"li", VERS_ALL, (CM_LIST|CM_OPT|CM_NO_INDENT), ParseBlock, null},
157
{"dt", VERS_ALL, (CM_DEFLIST|CM_OPT|CM_NO_INDENT), ParseInline, null},
158
{"dd", VERS_ALL, (CM_DEFLIST|CM_OPT|CM_NO_INDENT), ParseBlock, null},
160
{"caption", VERS_FROM32, CM_TABLE, ParseInline, CheckCaption},
161
{"colgroup", VERS_HTML40, (CM_TABLE|CM_OPT), ParseColGroup, null},
162
{"col", VERS_HTML40, (CM_TABLE|CM_EMPTY), ParseEmpty, null},
163
{"thead", (VERS_HTML40)&~VERS_BASIC, (CM_TABLE|CM_ROWGRP|CM_OPT), ParseRowGroup, null},
164
{"tfoot", (VERS_HTML40)&~VERS_BASIC, (CM_TABLE|CM_ROWGRP|CM_OPT), ParseRowGroup, null},
165
{"tbody", (VERS_HTML40)&~VERS_BASIC, (CM_TABLE|CM_ROWGRP|CM_OPT), ParseRowGroup, null},
166
{"tr", VERS_FROM32, (CM_TABLE|CM_OPT), ParseRow, null},
167
{"td", VERS_FROM32, (CM_ROW|CM_OPT|CM_NO_INDENT), ParseBlock, CheckTableCell},
168
{"th", VERS_FROM32, (CM_ROW|CM_OPT|CM_NO_INDENT), ParseBlock, CheckTableCell},
170
{"q", VERS_HTML40, CM_INLINE, ParseInline, null},
171
{"a", VERS_ALL, CM_INLINE, ParseInline, CheckAnchor},
172
{"br", VERS_ALL, (CM_INLINE|CM_EMPTY), ParseEmpty, null},
173
{"img", VERS_ALL, (CM_INLINE|CM_IMG|CM_EMPTY), ParseEmpty, CheckIMG},
174
{"object", VERS_HTML40, (CM_OBJECT|CM_HEAD|CM_IMG|CM_INLINE|CM_PARAM), ParseBlock, null},
175
{"applet", VERS_LOOSE, (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM), ParseBlock, null},
176
{"servlet", VERS_SUN, (CM_OBJECT|CM_IMG|CM_INLINE|CM_PARAM), ParseBlock, null},
177
{"param", VERS_FROM32, (CM_INLINE|CM_EMPTY), ParseEmpty, null},
178
{"embed", VERS_NETSCAPE, (CM_INLINE|CM_IMG|CM_EMPTY), ParseEmpty, null},
179
{"noembed", VERS_NETSCAPE, CM_INLINE, ParseInline, null},
180
{"iframe", VERS_IFRAME, CM_INLINE, ParseBlock, null},
181
{"frame", VERS_FRAMESET, (CM_FRAMES|CM_EMPTY), ParseEmpty, null},
182
{"noframes", VERS_IFRAME, (CM_BLOCK|CM_FRAMES), ParseNoFrames, null},
183
{"noscript", (VERS_HTML40)&~VERS_BASIC, (CM_BLOCK|CM_INLINE|CM_MIXED), ParseBlock, null},
184
{"b", (VERS_ALL)&~VERS_BASIC, CM_INLINE, ParseInline, null},
185
{"i", (VERS_ALL)&~VERS_BASIC, CM_INLINE, ParseInline, null},
186
{"u", VERS_LOOSE, CM_INLINE, ParseInline, null},
187
{"tt", (VERS_ALL)&~VERS_BASIC, CM_INLINE, ParseInline, null},
188
{"s", VERS_LOOSE, CM_INLINE, ParseInline, null},
189
{"strike", VERS_LOOSE, CM_INLINE, ParseInline, null},
190
{"big", (VERS_FROM32)&~VERS_BASIC, CM_INLINE, ParseInline, null},
191
{"small", (VERS_FROM32)&~VERS_BASIC, CM_INLINE, ParseInline, null},
192
{"sub", (VERS_FROM32)&~VERS_BASIC, CM_INLINE, ParseInline, null},
193
{"sup", (VERS_FROM32)&~VERS_BASIC, CM_INLINE, ParseInline, null},
194
{"em", VERS_ALL, CM_INLINE, ParseInline, null},
195
{"strong", VERS_ALL, CM_INLINE, ParseInline, null},
196
{"dfn", VERS_ALL, CM_INLINE, ParseInline, null},
197
{"code", VERS_ALL, CM_INLINE, ParseInline, null},
198
{"samp", VERS_ALL, CM_INLINE, ParseInline, null},
199
{"kbd", VERS_ALL, CM_INLINE, ParseInline, null},
200
{"var", VERS_ALL, CM_INLINE, ParseInline, null},
201
{"cite", VERS_ALL, CM_INLINE, ParseInline, null},
202
{"abbr", VERS_HTML40, CM_INLINE, ParseInline, null},
203
{"acronym", VERS_HTML40, CM_INLINE, ParseInline, null},
204
{"span", VERS_FROM32, CM_INLINE, ParseInline, null},
205
{"blink", VERS_PROPRIETARY, CM_INLINE, ParseInline, null},
206
{"nobr", VERS_PROPRIETARY, CM_INLINE, ParseInline, null},
207
{"wbr", VERS_PROPRIETARY, (CM_INLINE|CM_EMPTY), ParseEmpty, null},
208
{"marquee", VERS_MICROSOFT, (CM_INLINE|CM_OPT), ParseInline, null},
209
{"bgsound", VERS_MICROSOFT, (CM_HEAD|CM_EMPTY), ParseEmpty, null},
210
{"comment", VERS_MICROSOFT, CM_INLINE, ParseInline, null},
211
{"spacer", VERS_NETSCAPE, (CM_INLINE|CM_EMPTY), ParseEmpty, null},
212
{"keygen", VERS_NETSCAPE, (CM_INLINE|CM_EMPTY), ParseEmpty, null},
213
/* the next 2 entries (nolayer, ilayer) are already defined above - this does no harm, because install() reuses the existing entry when the name is already in the hash table */
214
{"nolayer", VERS_NETSCAPE, (CM_BLOCK|CM_INLINE|CM_MIXED), ParseBlock, null},
215
{"ilayer", VERS_NETSCAPE, CM_INLINE, ParseInline, null},
216
{"map", (VERS_FROM32)&~VERS_BASIC, CM_INLINE, ParseBlock, CheckMap},
217
{"area", (VERS_ALL)&~VERS_BASIC, (CM_BLOCK|CM_EMPTY), ParseEmpty, CheckAREA},
218
{"input", VERS_ALL, (CM_INLINE|CM_IMG|CM_EMPTY), ParseEmpty, null},
219
{"select", VERS_ALL, (CM_INLINE|CM_FIELD), ParseSelect, null},
220
{"option", VERS_ALL, (CM_FIELD|CM_OPT), ParseText, null},
221
{"optgroup", (VERS_HTML40)&~VERS_BASIC, (CM_FIELD|CM_OPT), ParseOptGroup, null},
222
{"textarea", VERS_ALL, (CM_INLINE|CM_FIELD), ParseText, null},
223
{"label", VERS_HTML40, CM_INLINE, ParseInline, null},
224
{"legend", (VERS_HTML40)&~VERS_BASIC, CM_INLINE, ParseInline, null},
225
{"button", (VERS_HTML40)&~VERS_BASIC, CM_INLINE, ParseInline, null},
226
{"basefont", VERS_LOOSE, (CM_INLINE|CM_EMPTY), ParseEmpty, null},
227
{"font", VERS_LOOSE, CM_INLINE, ParseInline, null},
228
{"bdo", (VERS_HTML40)&~VERS_BASIC, CM_INLINE, ParseInline, null},
230
/* elements for XHTML 1.1 */
232
{"ruby", VERS_XHTML11, CM_INLINE, ParseInline, null},
233
{"rbc", VERS_XHTML11, CM_INLINE, ParseInline, null},
234
{"rtc", VERS_XHTML11, CM_INLINE, ParseInline, null},
235
{"rb", VERS_XHTML11, CM_INLINE, ParseInline, null},
236
{"rt", VERS_XHTML11, CM_INLINE, ParseInline, null},
237
{"rp", VERS_XHTML11, CM_INLINE, ParseInline, null},
240
/* this must be the final entry */
244
/*
 choose what version to use for new doctype

 Reads the lexer->versions bit mask and tests it against the VERS_*
 constants in this order: HTML20, HTML32, XHTML11, HTML40_STRICT,
 HTML40_LOOSE, FRAMESET. The strict, loose and frameset tests each
 return the constant they matched.
 NOTE(review): the return statements for the first three tests and the
 final fall-through result are not visible in this excerpt - confirm.
*/
245
int HTMLVersion(Lexer *lexer)
249
versions = lexer->versions;
251
if (versions & VERS_HTML20)
254
/* HTML 3.2 is only considered when none of XmlOut, XmlTags and lexer->isvoyager is set */
if (!(XmlOut|XmlTags|lexer->isvoyager) &&
255
versions & VERS_HTML32)
258
if (versions & VERS_XHTML11)
261
if (versions & VERS_HTML40_STRICT)
262
return VERS_HTML40_STRICT;
264
if (versions & VERS_HTML40_LOOSE)
265
return VERS_HTML40_LOOSE;
267
if (versions & VERS_FRAMESET)
268
return VERS_FRAMESET;
273
/*
 Map a NUL-terminated tag name to a hashtab bucket index.
 Starting from 0, each character c is folded in as
 hashval = c + 31*hashval; the result is reduced modulo HASHSIZE,
 so the returned value is always in the range 0 .. HASHSIZE-1.
*/
static unsigned hash(char *s)
277
for (hashval = 0; *s != '\0'; s++)
278
hashval = *s + 31*hashval;
280
return hashval % HASHSIZE;
283
/*
 Search the hash table for the entry named s.
 Only the bucket chosen by hash(s) is scanned; its chain is followed
 through np->next and each entry's name is compared with wstrcmp().
 Callers in this file treat a null result as "not found".
 NOTE(review): the return statements are not visible in this excerpt;
 presumably the matching entry is returned - confirm.
*/
static Dict *lookup(char *s)
287
for (np = hashtab[hash(s)]; np != null; np = np->next)
288
if (wstrcmp(s, np->name) == 0)
293
/*
 Add a tag definition to the hash table, or update an existing one.
 If lookup(name) finds nothing, a new Dict is allocated with MemAlloc(),
 given a wstrdup() copy of name, and linked in at the head of the
 bucket chain selected by hash(name). In both cases the entry's
 fields are then (re)assigned from the arguments, so installing a name
 twice leaves a single entry.
 NOTE(review): what happens when the allocation or the name copy fails,
 and the value returned, are not visible in this excerpt - confirm.
*/
static Dict *install(char *name, uint versions, uint model,
294
Parser *parser, CheckAttribs *chkattrs)
299
if ((np = lookup(name)) == null)
301
np = (Dict *)MemAlloc(sizeof(*np));
303
if (np == null || (np->name = wstrdup(name)) == null)
306
hashval = hash(name);
/* push the new entry on the front of its bucket chain */
307
np->next = hashtab[hashval];
309
hashtab[hashval] = np;
312
np->versions = versions;
315
np->chkattrs = chkattrs;
319
/* public interface for finding tag by name */
/*
 Looks up node->element in the tag hash table when node->element is
 non-null. One path assigns the shared xml_tags dummy entry to
 node->tag instead.
 NOTE(review): the condition that selects the xml_tags path and the
 Bool results returned are not visible in this excerpt - confirm.
*/
320
Bool FindTag(Node *node)
326
node->tag = xml_tags;
330
if (node->element && (np = lookup(node->element)))
339
/*
 Looks up node->element in the tag hash table when node->element is
 non-null.
 NOTE(review): presumably returns the parser stored in the matching
 entry; the return statements are not visible in this excerpt - confirm.
*/
Parser *FindParser(Node *node)
343
if (node->element && (np = lookup(node->element)))
349
/*
 Register a user-defined tag called name.
 Every variant is installed as VERS_PROPRIETARY with CM_NO_INDENT|CM_NEW
 and no attribute checker; the variants differ in the extra content
 model flag and the parser:
   CM_EMPTY  with ParseBlock
   CM_INLINE with ParseInline (case tagtype_inline)
   CM_BLOCK  with ParseBlock
   CM_BLOCK  with ParsePre
 NOTE(review): only the tagtype_inline case label is visible in this
 excerpt; the tagType values selecting the other variants are not - confirm.
*/
void DefineTag(int tagType, char *name)
354
install(name, VERS_PROPRIETARY, (CM_EMPTY|CM_NO_INDENT|CM_NEW), ParseBlock, null);
356
case tagtype_inline :
357
install(name, VERS_PROPRIETARY, (CM_INLINE|CM_NO_INDENT|CM_NEW), ParseInline, null);
360
install(name, VERS_PROPRIETARY, (CM_BLOCK|CM_NO_INDENT|CM_NEW), ParseBlock, null);
363
install(name, VERS_PROPRIETARY, (CM_BLOCK|CM_NO_INDENT|CM_NEW), ParsePre, null);
368
/*
 NOTE(review): the body is not visible in this excerpt. Presumably this
 rewinds the curDictEntry / curHashIndex cursor consumed by
 FindNextDefinedTag() - confirm against the full source.
*/
void ResetDefinedTagSearch(void)
374
/*
 Advance the file-level search cursor (curDictEntry, curHashIndex) through
 the hash table until an entry matching tagType is found or the table is
 exhausted. A match sets tagName to the entry's name; the outer loop
 stops as soon as tagName is non-null or no entry is left.

 An entry matches only if its versions field is exactly VERS_PROPRIETARY
 and its model/parser fit the requested kind:
   - CM_EMPTY set, and the entry is not tag_wbr
   - CM_INLINE set, and the entry is none of tag_blink, tag_nobr, tag_wbr
     (case tagtype_inline)
   - CM_BLOCK set and parser == ParseBlock
   - CM_BLOCK set and parser == ParsePre
 The tag_blink / tag_nobr / tag_wbr exclusions keep these built-in
 VERS_PROPRIETARY entries from being reported as user-defined tags.

 NOTE(review): the switch header, the other case labels and the return
 statement are not visible in this excerpt; presumably tagName is
 returned (null when nothing more is found) - confirm.
*/
char *FindNextDefinedTag(int tagType)
376
char *tagName = null;
380
if (curDictEntry != null)
384
/* defined tags can be empty + inline */
386
if ((curDictEntry->versions == VERS_PROPRIETARY) &&
387
((curDictEntry->model & CM_EMPTY) == CM_EMPTY) &&
388
/* (curDictEntry->parser == ParseBlock) && */
389
(curDictEntry != tag_wbr))
390
tagName = curDictEntry->name;
392
/* defined tags can be empty + inline */
393
case tagtype_inline :
394
if ((curDictEntry->versions == VERS_PROPRIETARY) &&
395
((curDictEntry->model & CM_INLINE) == CM_INLINE) &&
396
/* (curDictEntry->parser == ParseInline) && */
397
(curDictEntry != tag_blink) &&
398
(curDictEntry != tag_nobr) &&
399
(curDictEntry != tag_wbr))
400
tagName = curDictEntry->name;
402
/* defined tags can be empty + block */
404
if ((curDictEntry->versions == VERS_PROPRIETARY) &&
405
((curDictEntry->model & CM_BLOCK) == CM_BLOCK) &&
406
(curDictEntry->parser == ParseBlock))
407
tagName = curDictEntry->name;
410
if ((curDictEntry->versions == VERS_PROPRIETARY) &&
411
((curDictEntry->model & CM_BLOCK) == CM_BLOCK) &&
412
(curDictEntry->parser == ParsePre))
413
tagName = curDictEntry->name;
/* step to the next entry in the current bucket chain */
417
curDictEntry = curDictEntry->next;
/* chain exhausted (or search not started): move on to the next non-empty bucket */
420
if (curDictEntry == null)
/*
 NOTE(review): hashtab[curHashIndex++] is read before the
 curHashIndex < HASHSIZE test below. If this function is called again
 after the table has been exhausted (curHashIndex already == HASHSIZE),
 that looks like a read one element past the end of hashtab - confirm
 against the full function and add a bound check before the read if so.
*/
423
curDictEntry = hashtab[curHashIndex++];
424
} while ((curDictEntry == null) && (curHashIndex < HASHSIZE));
426
} while ((tagName == null) && (curDictEntry != null));
435
/*
 Start-up initialisation of the tag dictionary (the enclosing function
 header is not visible in this excerpt).
 Step 1: install every row of the tags array; the walk stops at the
 first row whose name is null.
*/
for(tp = tags; tp->name != null; ++tp)
436
install(tp->name, tp->versions, tp->model, tp->parser, tp->chkattrs);
/*
 Step 2: cache the dictionary entries of frequently used tags in the
 tag_* pointers by looking each one up by name.
*/
438
tag_html = lookup("html");
439
tag_head = lookup("head");
440
tag_body = lookup("body");
441
tag_frameset = lookup("frameset");
442
tag_frame = lookup("frame");
443
tag_iframe = lookup("iframe"); /* #433359 - fix by Randy Waki 12 Mar 01 */
444
tag_noframes = lookup("noframes");
445
tag_meta = lookup("meta");
446
tag_title = lookup("title");
447
tag_base = lookup("base");
448
tag_hr = lookup("hr");
449
tag_pre = lookup("pre");
450
tag_listing = lookup("listing");
451
tag_h1 = lookup("h1");
452
tag_h2 = lookup("h2");
454
tag_ul = lookup("ul");
455
tag_ol = lookup("ol");
456
tag_dir = lookup("dir");
457
tag_li = lookup("li");
458
tag_dl = lookup("dl");
459
tag_dt = lookup("dt");
460
tag_dd = lookup("dd");
461
tag_td = lookup("td");
462
tag_th = lookup("th");
463
tag_tr = lookup("tr");
464
tag_col = lookup("col");
465
tag_br = lookup("br");
467
tag_link = lookup("link");
470
tag_strong = lookup("strong");
471
tag_em = lookup("em");
472
tag_big = lookup("big");
473
tag_small = lookup("small");
474
tag_param = lookup("param");
475
tag_option = lookup("option");
476
tag_optgroup = lookup("optgroup");
477
tag_img = lookup("img");
478
tag_map = lookup("map");
479
tag_area = lookup("area");
480
tag_nobr = lookup("nobr");
481
tag_wbr = lookup("wbr");
482
tag_font = lookup("font");
483
tag_spacer = lookup("spacer");
484
tag_layer = lookup("layer");
485
tag_center = lookup("center");
486
tag_style = lookup("style");
487
tag_script = lookup("script");
488
tag_noscript = lookup("noscript");
489
tag_table = lookup("table");
490
tag_caption = lookup("caption");
491
tag_form = lookup("form");
492
tag_textarea = lookup("textarea");
493
tag_blockquote = lookup("blockquote");
494
tag_applet = lookup("applet");
495
tag_object = lookup("object");
496
tag_div = lookup("div");
497
tag_span = lookup("span");
498
tag_input = lookup("input");
501
/* create dummy entry for all xml tags */
/*
 The dummy entry is allocated directly rather than through install(), so
 it is not linked into hashtab: it has no name, is valid for VERS_ALL,
 uses the CM_BLOCK model, and has no parser or attribute checker.
 NOTE(review): the MemAlloc() result is dereferenced without the null
 check that install() performs, and the next field is not assigned in the
 visible lines - confirm whether either matters to callers.
*/
502
xml_tags = (Dict *)MemAlloc(sizeof(*xml_tags));
503
xml_tags->name = null;
504
xml_tags->versions = VERS_ALL;
505
xml_tags->model = CM_BLOCK;
506
xml_tags->parser = null;
507
xml_tags->chkattrs = null;
509
tag_blink = lookup("blink"); /* so we can skip this in the search for user defined tags */
519
for (i = 0; i < HASHSIZE; ++i)