1
1
/* entities.c -- recognize HTML ISO entities
3
(c) 1998-2001 (W3C) MIT, INRIA, Keio University
4
See tidy.c for the copyright notice.
3
(c) 1998-2004 (W3C) MIT, ERCIM, Keio University
4
See tidy.h for the copyright notice.
8
8
$Author: terry_teague $
9
$Date: 2001/08/19 19:18:57 $
9
$Date: 2004/08/02 02:25:13 $
12
Entity handling can be static because there are no config or
13
document-specific values. Lookup table is 100% defined at
27
static struct nlist *hashtab[HASHSIZE];
291
static unsigned hash(char *s)
295
for (hashval = 0; *s != '\0'; s++)
296
hashval = *s + 31*hashval;
298
return hashval % HASHSIZE;
301
static struct nlist *lookup(char *s)
305
for (np = hashtab[hash(s)]; np != null; np = np->next)
306
if (wstrcmp(s, np->name) == 0)
24
typedef struct _entity entity;
34
static const entity entities[] =
37
** Markup pre-defined character entities
39
{ "quot", VERS_ALL|VERS_XML, 34 },
40
{ "amp", VERS_ALL|VERS_XML, 38 },
41
{ "apos", VERS_FROM40|VERS_XML, 39 },
42
{ "lt", VERS_ALL|VERS_XML, 60 },
43
{ "gt", VERS_ALL|VERS_XML, 62 },
46
** Latin-1 character entities
48
{ "nbsp", VERS_ALL, 160 },
49
{ "iexcl", VERS_ALL, 161 },
50
{ "cent", VERS_ALL, 162 },
51
{ "pound", VERS_ALL, 163 },
52
{ "curren", VERS_ALL, 164 },
53
{ "yen", VERS_ALL, 165 },
54
{ "brvbar", VERS_ALL, 166 },
55
{ "sect", VERS_ALL, 167 },
56
{ "uml", VERS_ALL, 168 },
57
{ "copy", VERS_ALL, 169 },
58
{ "ordf", VERS_ALL, 170 },
59
{ "laquo", VERS_ALL, 171 },
60
{ "not", VERS_ALL, 172 },
61
{ "shy", VERS_ALL, 173 },
62
{ "reg", VERS_ALL, 174 },
63
{ "macr", VERS_ALL, 175 },
64
{ "deg", VERS_ALL, 176 },
65
{ "plusmn", VERS_ALL, 177 },
66
{ "sup2", VERS_ALL, 178 },
67
{ "sup3", VERS_ALL, 179 },
68
{ "acute", VERS_ALL, 180 },
69
{ "micro", VERS_ALL, 181 },
70
{ "para", VERS_ALL, 182 },
71
{ "middot", VERS_ALL, 183 },
72
{ "cedil", VERS_ALL, 184 },
73
{ "sup1", VERS_ALL, 185 },
74
{ "ordm", VERS_ALL, 186 },
75
{ "raquo", VERS_ALL, 187 },
76
{ "frac14", VERS_ALL, 188 },
77
{ "frac12", VERS_ALL, 189 },
78
{ "frac34", VERS_ALL, 190 },
79
{ "iquest", VERS_ALL, 191 },
80
{ "Agrave", VERS_ALL, 192 },
81
{ "Aacute", VERS_ALL, 193 },
82
{ "Acirc", VERS_ALL, 194 },
83
{ "Atilde", VERS_ALL, 195 },
84
{ "Auml", VERS_ALL, 196 },
85
{ "Aring", VERS_ALL, 197 },
86
{ "AElig", VERS_ALL, 198 },
87
{ "Ccedil", VERS_ALL, 199 },
88
{ "Egrave", VERS_ALL, 200 },
89
{ "Eacute", VERS_ALL, 201 },
90
{ "Ecirc", VERS_ALL, 202 },
91
{ "Euml", VERS_ALL, 203 },
92
{ "Igrave", VERS_ALL, 204 },
93
{ "Iacute", VERS_ALL, 205 },
94
{ "Icirc", VERS_ALL, 206 },
95
{ "Iuml", VERS_ALL, 207 },
96
{ "ETH", VERS_ALL, 208 },
97
{ "Ntilde", VERS_ALL, 209 },
98
{ "Ograve", VERS_ALL, 210 },
99
{ "Oacute", VERS_ALL, 211 },
100
{ "Ocirc", VERS_ALL, 212 },
101
{ "Otilde", VERS_ALL, 213 },
102
{ "Ouml", VERS_ALL, 214 },
103
{ "times", VERS_ALL, 215 },
104
{ "Oslash", VERS_ALL, 216 },
105
{ "Ugrave", VERS_ALL, 217 },
106
{ "Uacute", VERS_ALL, 218 },
107
{ "Ucirc", VERS_ALL, 219 },
108
{ "Uuml", VERS_ALL, 220 },
109
{ "Yacute", VERS_ALL, 221 },
110
{ "THORN", VERS_ALL, 222 },
111
{ "szlig", VERS_ALL, 223 },
112
{ "agrave", VERS_ALL, 224 },
113
{ "aacute", VERS_ALL, 225 },
114
{ "acirc", VERS_ALL, 226 },
115
{ "atilde", VERS_ALL, 227 },
116
{ "auml", VERS_ALL, 228 },
117
{ "aring", VERS_ALL, 229 },
118
{ "aelig", VERS_ALL, 230 },
119
{ "ccedil", VERS_ALL, 231 },
120
{ "egrave", VERS_ALL, 232 },
121
{ "eacute", VERS_ALL, 233 },
122
{ "ecirc", VERS_ALL, 234 },
123
{ "euml", VERS_ALL, 235 },
124
{ "igrave", VERS_ALL, 236 },
125
{ "iacute", VERS_ALL, 237 },
126
{ "icirc", VERS_ALL, 238 },
127
{ "iuml", VERS_ALL, 239 },
128
{ "eth", VERS_ALL, 240 },
129
{ "ntilde", VERS_ALL, 241 },
130
{ "ograve", VERS_ALL, 242 },
131
{ "oacute", VERS_ALL, 243 },
132
{ "ocirc", VERS_ALL, 244 },
133
{ "otilde", VERS_ALL, 245 },
134
{ "ouml", VERS_ALL, 246 },
135
{ "divide", VERS_ALL, 247 },
136
{ "oslash", VERS_ALL, 248 },
137
{ "ugrave", VERS_ALL, 249 },
138
{ "uacute", VERS_ALL, 250 },
139
{ "ucirc", VERS_ALL, 251 },
140
{ "uuml", VERS_ALL, 252 },
141
{ "yacute", VERS_ALL, 253 },
142
{ "thorn", VERS_ALL, 254 },
143
{ "yuml", VERS_ALL, 255 },
146
** Extended Entities defined in HTML 4: Symbols
148
{ "fnof", VERS_FROM40, 402 },
149
{ "Alpha", VERS_FROM40, 913 },
150
{ "Beta", VERS_FROM40, 914 },
151
{ "Gamma", VERS_FROM40, 915 },
152
{ "Delta", VERS_FROM40, 916 },
153
{ "Epsilon", VERS_FROM40, 917 },
154
{ "Zeta", VERS_FROM40, 918 },
155
{ "Eta", VERS_FROM40, 919 },
156
{ "Theta", VERS_FROM40, 920 },
157
{ "Iota", VERS_FROM40, 921 },
158
{ "Kappa", VERS_FROM40, 922 },
159
{ "Lambda", VERS_FROM40, 923 },
160
{ "Mu", VERS_FROM40, 924 },
161
{ "Nu", VERS_FROM40, 925 },
162
{ "Xi", VERS_FROM40, 926 },
163
{ "Omicron", VERS_FROM40, 927 },
164
{ "Pi", VERS_FROM40, 928 },
165
{ "Rho", VERS_FROM40, 929 },
166
{ "Sigma", VERS_FROM40, 931 },
167
{ "Tau", VERS_FROM40, 932 },
168
{ "Upsilon", VERS_FROM40, 933 },
169
{ "Phi", VERS_FROM40, 934 },
170
{ "Chi", VERS_FROM40, 935 },
171
{ "Psi", VERS_FROM40, 936 },
172
{ "Omega", VERS_FROM40, 937 },
173
{ "alpha", VERS_FROM40, 945 },
174
{ "beta", VERS_FROM40, 946 },
175
{ "gamma", VERS_FROM40, 947 },
176
{ "delta", VERS_FROM40, 948 },
177
{ "epsilon", VERS_FROM40, 949 },
178
{ "zeta", VERS_FROM40, 950 },
179
{ "eta", VERS_FROM40, 951 },
180
{ "theta", VERS_FROM40, 952 },
181
{ "iota", VERS_FROM40, 953 },
182
{ "kappa", VERS_FROM40, 954 },
183
{ "lambda", VERS_FROM40, 955 },
184
{ "mu", VERS_FROM40, 956 },
185
{ "nu", VERS_FROM40, 957 },
186
{ "xi", VERS_FROM40, 958 },
187
{ "omicron", VERS_FROM40, 959 },
188
{ "pi", VERS_FROM40, 960 },
189
{ "rho", VERS_FROM40, 961 },
190
{ "sigmaf", VERS_FROM40, 962 },
191
{ "sigma", VERS_FROM40, 963 },
192
{ "tau", VERS_FROM40, 964 },
193
{ "upsilon", VERS_FROM40, 965 },
194
{ "phi", VERS_FROM40, 966 },
195
{ "chi", VERS_FROM40, 967 },
196
{ "psi", VERS_FROM40, 968 },
197
{ "omega", VERS_FROM40, 969 },
198
{ "thetasym", VERS_FROM40, 977 },
199
{ "upsih", VERS_FROM40, 978 },
200
{ "piv", VERS_FROM40, 982 },
201
{ "bull", VERS_FROM40, 8226 },
202
{ "hellip", VERS_FROM40, 8230 },
203
{ "prime", VERS_FROM40, 8242 },
204
{ "Prime", VERS_FROM40, 8243 },
205
{ "oline", VERS_FROM40, 8254 },
206
{ "frasl", VERS_FROM40, 8260 },
207
{ "weierp", VERS_FROM40, 8472 },
208
{ "image", VERS_FROM40, 8465 },
209
{ "real", VERS_FROM40, 8476 },
210
{ "trade", VERS_FROM40, 8482 },
211
{ "alefsym", VERS_FROM40, 8501 },
212
{ "larr", VERS_FROM40, 8592 },
213
{ "uarr", VERS_FROM40, 8593 },
214
{ "rarr", VERS_FROM40, 8594 },
215
{ "darr", VERS_FROM40, 8595 },
216
{ "harr", VERS_FROM40, 8596 },
217
{ "crarr", VERS_FROM40, 8629 },
218
{ "lArr", VERS_FROM40, 8656 },
219
{ "uArr", VERS_FROM40, 8657 },
220
{ "rArr", VERS_FROM40, 8658 },
221
{ "dArr", VERS_FROM40, 8659 },
222
{ "hArr", VERS_FROM40, 8660 },
223
{ "forall", VERS_FROM40, 8704 },
224
{ "part", VERS_FROM40, 8706 },
225
{ "exist", VERS_FROM40, 8707 },
226
{ "empty", VERS_FROM40, 8709 },
227
{ "nabla", VERS_FROM40, 8711 },
228
{ "isin", VERS_FROM40, 8712 },
229
{ "notin", VERS_FROM40, 8713 },
230
{ "ni", VERS_FROM40, 8715 },
231
{ "prod", VERS_FROM40, 8719 },
232
{ "sum", VERS_FROM40, 8721 },
233
{ "minus", VERS_FROM40, 8722 },
234
{ "lowast", VERS_FROM40, 8727 },
235
{ "radic", VERS_FROM40, 8730 },
236
{ "prop", VERS_FROM40, 8733 },
237
{ "infin", VERS_FROM40, 8734 },
238
{ "ang", VERS_FROM40, 8736 },
239
{ "and", VERS_FROM40, 8743 },
240
{ "or", VERS_FROM40, 8744 },
241
{ "cap", VERS_FROM40, 8745 },
242
{ "cup", VERS_FROM40, 8746 },
243
{ "int", VERS_FROM40, 8747 },
244
{ "there4", VERS_FROM40, 8756 },
245
{ "sim", VERS_FROM40, 8764 },
246
{ "cong", VERS_FROM40, 8773 },
247
{ "asymp", VERS_FROM40, 8776 },
248
{ "ne", VERS_FROM40, 8800 },
249
{ "equiv", VERS_FROM40, 8801 },
250
{ "le", VERS_FROM40, 8804 },
251
{ "ge", VERS_FROM40, 8805 },
252
{ "sub", VERS_FROM40, 8834 },
253
{ "sup", VERS_FROM40, 8835 },
254
{ "nsub", VERS_FROM40, 8836 },
255
{ "sube", VERS_FROM40, 8838 },
256
{ "supe", VERS_FROM40, 8839 },
257
{ "oplus", VERS_FROM40, 8853 },
258
{ "otimes", VERS_FROM40, 8855 },
259
{ "perp", VERS_FROM40, 8869 },
260
{ "sdot", VERS_FROM40, 8901 },
261
{ "lceil", VERS_FROM40, 8968 },
262
{ "rceil", VERS_FROM40, 8969 },
263
{ "lfloor", VERS_FROM40, 8970 },
264
{ "rfloor", VERS_FROM40, 8971 },
265
{ "lang", VERS_FROM40, 9001 },
266
{ "rang", VERS_FROM40, 9002 },
267
{ "loz", VERS_FROM40, 9674 },
268
{ "spades", VERS_FROM40, 9824 },
269
{ "clubs", VERS_FROM40, 9827 },
270
{ "hearts", VERS_FROM40, 9829 },
271
{ "diams", VERS_FROM40, 9830 },
274
** Extended Entities defined in HTML 4: Special (less Markup at top)
276
{ "OElig", VERS_FROM40, 338 },
277
{ "oelig", VERS_FROM40, 339 },
278
{ "Scaron", VERS_FROM40, 352 },
279
{ "scaron", VERS_FROM40, 353 },
280
{ "Yuml", VERS_FROM40, 376 },
281
{ "circ", VERS_FROM40, 710 },
282
{ "tilde", VERS_FROM40, 732 },
283
{ "ensp", VERS_FROM40, 8194 },
284
{ "emsp", VERS_FROM40, 8195 },
285
{ "thinsp", VERS_FROM40, 8201 },
286
{ "zwnj", VERS_FROM40, 8204 },
287
{ "zwj", VERS_FROM40, 8205 },
288
{ "lrm", VERS_FROM40, 8206 },
289
{ "rlm", VERS_FROM40, 8207 },
290
{ "ndash", VERS_FROM40, 8211 },
291
{ "mdash", VERS_FROM40, 8212 },
292
{ "lsquo", VERS_FROM40, 8216 },
293
{ "rsquo", VERS_FROM40, 8217 },
294
{ "sbquo", VERS_FROM40, 8218 },
295
{ "ldquo", VERS_FROM40, 8220 },
296
{ "rdquo", VERS_FROM40, 8221 },
297
{ "bdquo", VERS_FROM40, 8222 },
298
{ "dagger", VERS_FROM40, 8224 },
299
{ "Dagger", VERS_FROM40, 8225 },
300
{ "permil", VERS_FROM40, 8240 },
301
{ "lsaquo", VERS_FROM40, 8249 },
302
{ "rsaquo", VERS_FROM40, 8250 },
303
{ "euro", VERS_FROM40, 8364 },
308
/* Pure static implementation. Trades off lookup speed
309
** for faster setup time (well, none actually).
310
** Optimization of comparing 1st character buys enough
311
** speed that hash doesn't improve things without > 500
314
static const entity* lookup( ctmbstr s )
316
tmbchar ch = (tmbchar)( s ? *s : 0 );
318
for ( np = entities; ch && np && np->name; ++np )
319
if ( ch == *np->name && tmbstrcmp(s, np->name) == 0 )
311
static struct nlist *install(char *name, uint code)
316
if ((np = lookup(name)) == null)
318
np = (struct nlist *)MemAlloc(sizeof(*np));
320
if (np == null || (np->name = wstrdup(name)) == null)
323
hashval = hash(name);
324
np->next = hashtab[hashval];
325
hashtab[hashval] = np;
333
324
/* entity starting with "&" returns zero on error */
334
uint EntityCode(char *name)
325
uint EntityCode( ctmbstr name, uint versions )
328
assert( name && name[0] == '&' );
339
330
/* numeric entity: name = "&#" followed by number */
331
if ( name[1] == '#' )
342
c = 0; /* zero on missing/bad number */
333
uint c = 0; /* zero on missing/bad number */
334
Bool isXml = ( (versions & VERS_XML) == VERS_XML );
344
336
/* 'x' prefix denotes hexadecimal number format */
345
if (name[2] == 'x' || (!XmlTags && name[2] == 'X')) /* #427833 - fix by Björn Höhrmann 05 Jun 01 */
346
sscanf(name+3, "%x", &c);
337
if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
338
sscanf( name+3, "%x", &c );
348
sscanf(name+2, "%d", &c);
340
sscanf( name+2, "%u", &c );
353
345
/* Named entity: name ="&" followed by a name */
354
if ((np = lookup(name+1)))
346
if ( NULL != (np = lookup(name+1)) )
348
/* Only recognize entity name if version supports it. */
349
if ( np->versions & versions )
357
353
return 0; /* zero signifies unknown entity name */
360
void InitEntities(void)
356
Bool EntityInfo( ctmbstr name, Bool isXml, uint* code, uint* versions )
364
for(ep = entities; ep->name != null; ++ep)
365
install(ep->name, ep->code);
359
assert( name && name[0] == '&' );
360
assert( code != NULL );
361
assert( versions != NULL );
363
/* numeric entity: name = "&#" followed by number */
364
if ( name[1] == '#' )
366
uint c = 0; /* zero on missing/bad number */
368
/* 'x' prefix denotes hexadecimal number format */
369
if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
370
sscanf( name+3, "%x", &c );
372
sscanf( name+2, "%u", &c );
375
*versions = VERS_ALL;
379
/* Named entity: name ="&" followed by a name */
380
if ( NULL != (np = lookup(name+1)) )
383
*versions = np->versions;
388
*versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
368
void FreeEntities(void)
393
ctmbstr EntityName( uint ch, uint versions )
370
struct nlist *prev, *next;
395
ctmbstr entnam = NULL;
373
for (i = 0; i < HASHSIZE; ++i)
398
for ( ep = entities; ep->name != NULL; ++ep )
400
if ( ep->code == ch )
402
if ( ep->versions & versions )
404
break; /* Found code. Stop search. */
391
char *EntityName(uint n)
395
for(ep = entities; ep->name != null; ++ep)