1
/* lexer.c -- Lexer for html parser
3
(c) 1998-2005 (W3C) MIT, ERCIM, Keio University
4
See tidy.h for the copyright notice.
9
$Date: 2005/03/22 17:36:03 $
15
Given a file stream fp it returns a sequence of tokens.
17
GetToken(fp) gets the next token
18
UngetToken(fp) provides one level undo
20
The tags include an attribute list:
22
- linked list of attribute/value nodes
23
- each node has 2 NULL-terminated strings.
24
- entities are replaced in attribute values
26
white space is compacted if not in preformatted mode
27
If not in preformatted mode then leading white space
28
is discarded and subsequent white space sequences
29
compacted to single space characters.
31
If XmlTags is no then Tag names are folded to upper
32
case and attribute names to lower case.
35
- Doctype subset and marked sections
51
/* swallows closing '>' */
52
static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
54
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty,
55
Node **asp, Node **php );
57
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
58
Bool *isempty, int *pdelim );
60
static Node *ParseDocTypeDecl(TidyDocImpl* doc);
62
static void AddAttrToList( AttVal** list, AttVal* av );
64
/* used to classify characters for lexical purposes */
65
/* Look up the lexmap classification bits for c.  Any value outside
** 0..127 (including negative values, which convert to large unsigned
** numbers) classifies as 0.  The argument is parenthesised so that
** expression arguments such as MAP(ch & 0xff) expand as intended;
** note that c may still be evaluated twice. */
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
66
static uint lexmap[128];
68
#define IsValidXMLAttrName(name) IsValidXMLID(name)
69
#define IsValidXMLElemName(name) IsValidXMLID(name)
71
static struct _doctypes
78
} const W3C_Doctypes[] =
80
{ 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, },
81
{ 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, },
82
{ 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, },
83
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, },
84
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, },
85
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, },
86
{ 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" },
87
{ 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" },
88
{ 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" },
89
{ 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" },
90
{ 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" },
91
{ 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" },
92
{ 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" },
93
{ 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" },
94
{ 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" },
95
{ 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" },
96
{ 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" },
98
/* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
100
{ 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" },
101
{ 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
104
{ 0, 0, NULL, NULL, NULL }
107
int HTMLVersion(TidyDocImpl* doc)
112
uint vers = doc->lexer->versions;
113
uint dtver = doc->lexer->doctype;
114
TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
115
Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
116
!cfgBool(doc, TidyHtmlOut);
117
Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver;
119
for (i = 0; W3C_Doctypes[i].name; ++i)
121
if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
122
(html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers)))
125
if (vers & W3C_Doctypes[i].vers &&
126
(W3C_Doctypes[i].score < score || !score))
128
score = W3C_Doctypes[i].score;
134
return W3C_Doctypes[j].vers;
139
/* Return the fpi field of the first W3C_Doctypes entry whose vers
** field equals vers.  The scan stops at the first entry whose name
** is NULL.
** NOTE(review): the result when no entry matches is not visible in
** this view -- confirm before relying on it. */
ctmbstr GetFPIFromVers(uint vers)
143
for (i = 0; W3C_Doctypes[i].name; ++i)
144
if (W3C_Doctypes[i].vers == vers)
145
return W3C_Doctypes[i].fpi;
150
/* Return the si field of the first W3C_Doctypes entry whose vers
** field equals vers.  The scan stops at the first entry whose name
** is NULL.
** NOTE(review): the result when no entry matches is not visible in
** this view -- confirm before relying on it. */
static ctmbstr GetSIFromVers(uint vers)
154
for (i = 0; W3C_Doctypes[i].name; ++i)
155
if (W3C_Doctypes[i].vers == vers)
156
return W3C_Doctypes[i].si;
161
/* Return the name field of the first W3C_Doctypes entry whose vers
** field equals vers.  The scan stops at the first entry whose name
** is NULL.
** NOTE(review): the result when no entry matches is not visible in
** this view -- confirm before relying on it. */
static ctmbstr GetNameFromVers(uint vers)
165
for (i = 0; W3C_Doctypes[i].name; ++i)
166
if (W3C_Doctypes[i].vers == vers)
167
return W3C_Doctypes[i].name;
172
/* Return the vers field of the first W3C_Doctypes entry whose fpi
** compares equal to the given string under tmbstrcasecmp() (by its
** name a case-insensitive comparison -- confirm).  The scan stops at
** the first entry whose name is NULL.
** NOTE(review): the result when no entry matches is not visible in
** this view -- confirm before relying on it. */
static uint GetVersFromFPI(ctmbstr fpi)
176
for (i = 0; W3C_Doctypes[i].name; ++i)
177
if (tmbstrcasecmp(W3C_Doctypes[i].fpi, fpi) == 0)
178
return W3C_Doctypes[i].vers;
183
/* everything is allowed in proprietary version of HTML */
184
/* this is handled here rather than in the tag/attr dicts */
185
void ConstrainVersion(TidyDocImpl* doc, uint vers)
{
    /* Keep only the version bits that are also set in vers.  The
    ** VERS_PROPRIETARY bit is OR-ed into the mask, so this call never
    ** clears that bit in doc->lexer->versions. */
    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
}
194
return (Bool)(map & white);
197
Bool IsNewline(uint c)
200
return (Bool)(map & newline);
209
return (Bool)(map & digit);
212
Bool IsLetter(uint c)
218
return (Bool)(map & letter);
221
Bool IsNamechar(uint c)
224
return (Bool)(map & namechar);
227
Bool IsXMLLetter(uint c)
229
return ((c >= 0x41 && c <= 0x5a) ||
230
(c >= 0x61 && c <= 0x7a) ||
231
(c >= 0xc0 && c <= 0xd6) ||
232
(c >= 0xd8 && c <= 0xf6) ||
233
(c >= 0xf8 && c <= 0xff) ||
234
(c >= 0x100 && c <= 0x131) ||
235
(c >= 0x134 && c <= 0x13e) ||
236
(c >= 0x141 && c <= 0x148) ||
237
(c >= 0x14a && c <= 0x17e) ||
238
(c >= 0x180 && c <= 0x1c3) ||
239
(c >= 0x1cd && c <= 0x1f0) ||
240
(c >= 0x1f4 && c <= 0x1f5) ||
241
(c >= 0x1fa && c <= 0x217) ||
242
(c >= 0x250 && c <= 0x2a8) ||
243
(c >= 0x2bb && c <= 0x2c1) ||
245
(c >= 0x388 && c <= 0x38a) ||
247
(c >= 0x38e && c <= 0x3a1) ||
248
(c >= 0x3a3 && c <= 0x3ce) ||
249
(c >= 0x3d0 && c <= 0x3d6) ||
254
(c >= 0x3e2 && c <= 0x3f3) ||
255
(c >= 0x401 && c <= 0x40c) ||
256
(c >= 0x40e && c <= 0x44f) ||
257
(c >= 0x451 && c <= 0x45c) ||
258
(c >= 0x45e && c <= 0x481) ||
259
(c >= 0x490 && c <= 0x4c4) ||
260
(c >= 0x4c7 && c <= 0x4c8) ||
261
(c >= 0x4cb && c <= 0x4cc) ||
262
(c >= 0x4d0 && c <= 0x4eb) ||
263
(c >= 0x4ee && c <= 0x4f5) ||
264
(c >= 0x4f8 && c <= 0x4f9) ||
265
(c >= 0x531 && c <= 0x556) ||
267
(c >= 0x561 && c <= 0x586) ||
268
(c >= 0x5d0 && c <= 0x5ea) ||
269
(c >= 0x5f0 && c <= 0x5f2) ||
270
(c >= 0x621 && c <= 0x63a) ||
271
(c >= 0x641 && c <= 0x64a) ||
272
(c >= 0x671 && c <= 0x6b7) ||
273
(c >= 0x6ba && c <= 0x6be) ||
274
(c >= 0x6c0 && c <= 0x6ce) ||
275
(c >= 0x6d0 && c <= 0x6d3) ||
277
(c >= 0x6e5 && c <= 0x6e6) ||
278
(c >= 0x905 && c <= 0x939) ||
280
(c >= 0x958 && c <= 0x961) ||
281
(c >= 0x985 && c <= 0x98c) ||
282
(c >= 0x98f && c <= 0x990) ||
283
(c >= 0x993 && c <= 0x9a8) ||
284
(c >= 0x9aa && c <= 0x9b0) ||
286
(c >= 0x9b6 && c <= 0x9b9) ||
287
(c >= 0x9dc && c <= 0x9dd) ||
288
(c >= 0x9df && c <= 0x9e1) ||
289
(c >= 0x9f0 && c <= 0x9f1) ||
290
(c >= 0xa05 && c <= 0xa0a) ||
291
(c >= 0xa0f && c <= 0xa10) ||
292
(c >= 0xa13 && c <= 0xa28) ||
293
(c >= 0xa2a && c <= 0xa30) ||
294
(c >= 0xa32 && c <= 0xa33) ||
295
(c >= 0xa35 && c <= 0xa36) ||
296
(c >= 0xa38 && c <= 0xa39) ||
297
(c >= 0xa59 && c <= 0xa5c) ||
299
(c >= 0xa72 && c <= 0xa74) ||
300
(c >= 0xa85 && c <= 0xa8b) ||
302
(c >= 0xa8f && c <= 0xa91) ||
303
(c >= 0xa93 && c <= 0xaa8) ||
304
(c >= 0xaaa && c <= 0xab0) ||
305
(c >= 0xab2 && c <= 0xab3) ||
306
(c >= 0xab5 && c <= 0xab9) ||
309
(c >= 0xb05 && c <= 0xb0c) ||
310
(c >= 0xb0f && c <= 0xb10) ||
311
(c >= 0xb13 && c <= 0xb28) ||
312
(c >= 0xb2a && c <= 0xb30) ||
313
(c >= 0xb32 && c <= 0xb33) ||
314
(c >= 0xb36 && c <= 0xb39) ||
316
(c >= 0xb5c && c <= 0xb5d) ||
317
(c >= 0xb5f && c <= 0xb61) ||
318
(c >= 0xb85 && c <= 0xb8a) ||
319
(c >= 0xb8e && c <= 0xb90) ||
320
(c >= 0xb92 && c <= 0xb95) ||
321
(c >= 0xb99 && c <= 0xb9a) ||
323
(c >= 0xb9e && c <= 0xb9f) ||
324
(c >= 0xba3 && c <= 0xba4) ||
325
(c >= 0xba8 && c <= 0xbaa) ||
326
(c >= 0xbae && c <= 0xbb5) ||
327
(c >= 0xbb7 && c <= 0xbb9) ||
328
(c >= 0xc05 && c <= 0xc0c) ||
329
(c >= 0xc0e && c <= 0xc10) ||
330
(c >= 0xc12 && c <= 0xc28) ||
331
(c >= 0xc2a && c <= 0xc33) ||
332
(c >= 0xc35 && c <= 0xc39) ||
333
(c >= 0xc60 && c <= 0xc61) ||
334
(c >= 0xc85 && c <= 0xc8c) ||
335
(c >= 0xc8e && c <= 0xc90) ||
336
(c >= 0xc92 && c <= 0xca8) ||
337
(c >= 0xcaa && c <= 0xcb3) ||
338
(c >= 0xcb5 && c <= 0xcb9) ||
340
(c >= 0xce0 && c <= 0xce1) ||
341
(c >= 0xd05 && c <= 0xd0c) ||
342
(c >= 0xd0e && c <= 0xd10) ||
343
(c >= 0xd12 && c <= 0xd28) ||
344
(c >= 0xd2a && c <= 0xd39) ||
345
(c >= 0xd60 && c <= 0xd61) ||
346
(c >= 0xe01 && c <= 0xe2e) ||
348
(c >= 0xe32 && c <= 0xe33) ||
349
(c >= 0xe40 && c <= 0xe45) ||
350
(c >= 0xe81 && c <= 0xe82) ||
352
(c >= 0xe87 && c <= 0xe88) ||
355
(c >= 0xe94 && c <= 0xe97) ||
356
(c >= 0xe99 && c <= 0xe9f) ||
357
(c >= 0xea1 && c <= 0xea3) ||
360
(c >= 0xeaa && c <= 0xeab) ||
361
(c >= 0xead && c <= 0xeae) ||
363
(c >= 0xeb2 && c <= 0xeb3) ||
365
(c >= 0xec0 && c <= 0xec4) ||
366
(c >= 0xf40 && c <= 0xf47) ||
367
(c >= 0xf49 && c <= 0xf69) ||
368
(c >= 0x10a0 && c <= 0x10c5) ||
369
(c >= 0x10d0 && c <= 0x10f6) ||
371
(c >= 0x1102 && c <= 0x1103) ||
372
(c >= 0x1105 && c <= 0x1107) ||
374
(c >= 0x110b && c <= 0x110c) ||
375
(c >= 0x110e && c <= 0x1112) ||
382
(c >= 0x1154 && c <= 0x1155) ||
384
(c >= 0x115f && c <= 0x1161) ||
389
(c >= 0x116d && c <= 0x116e) ||
390
(c >= 0x1172 && c <= 0x1173) ||
395
(c >= 0x11ae && c <= 0x11af) ||
396
(c >= 0x11b7 && c <= 0x11b8) ||
398
(c >= 0x11bc && c <= 0x11c2) ||
402
(c >= 0x1e00 && c <= 0x1e9b) ||
403
(c >= 0x1ea0 && c <= 0x1ef9) ||
404
(c >= 0x1f00 && c <= 0x1f15) ||
405
(c >= 0x1f18 && c <= 0x1f1d) ||
406
(c >= 0x1f20 && c <= 0x1f45) ||
407
(c >= 0x1f48 && c <= 0x1f4d) ||
408
(c >= 0x1f50 && c <= 0x1f57) ||
412
(c >= 0x1f5f && c <= 0x1f7d) ||
413
(c >= 0x1f80 && c <= 0x1fb4) ||
414
(c >= 0x1fb6 && c <= 0x1fbc) ||
416
(c >= 0x1fc2 && c <= 0x1fc4) ||
417
(c >= 0x1fc6 && c <= 0x1fcc) ||
418
(c >= 0x1fd0 && c <= 0x1fd3) ||
419
(c >= 0x1fd6 && c <= 0x1fdb) ||
420
(c >= 0x1fe0 && c <= 0x1fec) ||
421
(c >= 0x1ff2 && c <= 0x1ff4) ||
422
(c >= 0x1ff6 && c <= 0x1ffc) ||
424
(c >= 0x212a && c <= 0x212b) ||
426
(c >= 0x2180 && c <= 0x2182) ||
427
(c >= 0x3041 && c <= 0x3094) ||
428
(c >= 0x30a1 && c <= 0x30fa) ||
429
(c >= 0x3105 && c <= 0x312c) ||
430
(c >= 0xac00 && c <= 0xd7a3) ||
431
(c >= 0x4e00 && c <= 0x9fa5) ||
433
(c >= 0x3021 && c <= 0x3029) ||
434
(c >= 0x4e00 && c <= 0x9fa5) ||
436
(c >= 0x3021 && c <= 0x3029));
439
Bool IsXMLNamechar(uint c)
441
return (IsXMLLetter(c) ||
442
c == '.' || c == '_' ||
443
c == ':' || c == '-' ||
444
(c >= 0x300 && c <= 0x345) ||
445
(c >= 0x360 && c <= 0x361) ||
446
(c >= 0x483 && c <= 0x486) ||
447
(c >= 0x591 && c <= 0x5a1) ||
448
(c >= 0x5a3 && c <= 0x5b9) ||
449
(c >= 0x5bb && c <= 0x5bd) ||
451
(c >= 0x5c1 && c <= 0x5c2) ||
453
(c >= 0x64b && c <= 0x652) ||
455
(c >= 0x6d6 && c <= 0x6dc) ||
456
(c >= 0x6dd && c <= 0x6df) ||
457
(c >= 0x6e0 && c <= 0x6e4) ||
458
(c >= 0x6e7 && c <= 0x6e8) ||
459
(c >= 0x6ea && c <= 0x6ed) ||
460
(c >= 0x901 && c <= 0x903) ||
462
(c >= 0x93e && c <= 0x94c) ||
464
(c >= 0x951 && c <= 0x954) ||
465
(c >= 0x962 && c <= 0x963) ||
466
(c >= 0x981 && c <= 0x983) ||
470
(c >= 0x9c0 && c <= 0x9c4) ||
471
(c >= 0x9c7 && c <= 0x9c8) ||
472
(c >= 0x9cb && c <= 0x9cd) ||
474
(c >= 0x9e2 && c <= 0x9e3) ||
479
(c >= 0xa40 && c <= 0xa42) ||
480
(c >= 0xa47 && c <= 0xa48) ||
481
(c >= 0xa4b && c <= 0xa4d) ||
482
(c >= 0xa70 && c <= 0xa71) ||
483
(c >= 0xa81 && c <= 0xa83) ||
485
(c >= 0xabe && c <= 0xac5) ||
486
(c >= 0xac7 && c <= 0xac9) ||
487
(c >= 0xacb && c <= 0xacd) ||
488
(c >= 0xb01 && c <= 0xb03) ||
490
(c >= 0xb3e && c <= 0xb43) ||
491
(c >= 0xb47 && c <= 0xb48) ||
492
(c >= 0xb4b && c <= 0xb4d) ||
493
(c >= 0xb56 && c <= 0xb57) ||
494
(c >= 0xb82 && c <= 0xb83) ||
495
(c >= 0xbbe && c <= 0xbc2) ||
496
(c >= 0xbc6 && c <= 0xbc8) ||
497
(c >= 0xbca && c <= 0xbcd) ||
499
(c >= 0xc01 && c <= 0xc03) ||
500
(c >= 0xc3e && c <= 0xc44) ||
501
(c >= 0xc46 && c <= 0xc48) ||
502
(c >= 0xc4a && c <= 0xc4d) ||
503
(c >= 0xc55 && c <= 0xc56) ||
504
(c >= 0xc82 && c <= 0xc83) ||
505
(c >= 0xcbe && c <= 0xcc4) ||
506
(c >= 0xcc6 && c <= 0xcc8) ||
507
(c >= 0xcca && c <= 0xccd) ||
508
(c >= 0xcd5 && c <= 0xcd6) ||
509
(c >= 0xd02 && c <= 0xd03) ||
510
(c >= 0xd3e && c <= 0xd43) ||
511
(c >= 0xd46 && c <= 0xd48) ||
512
(c >= 0xd4a && c <= 0xd4d) ||
515
(c >= 0xe34 && c <= 0xe3a) ||
516
(c >= 0xe47 && c <= 0xe4e) ||
518
(c >= 0xeb4 && c <= 0xeb9) ||
519
(c >= 0xebb && c <= 0xebc) ||
520
(c >= 0xec8 && c <= 0xecd) ||
521
(c >= 0xf18 && c <= 0xf19) ||
527
(c >= 0xf71 && c <= 0xf84) ||
528
(c >= 0xf86 && c <= 0xf8b) ||
529
(c >= 0xf90 && c <= 0xf95) ||
531
(c >= 0xf99 && c <= 0xfad) ||
532
(c >= 0xfb1 && c <= 0xfb7) ||
534
(c >= 0x20d0 && c <= 0x20dc) ||
536
(c >= 0x302a && c <= 0x302f) ||
539
(c >= 0x30 && c <= 0x39) ||
540
(c >= 0x660 && c <= 0x669) ||
541
(c >= 0x6f0 && c <= 0x6f9) ||
542
(c >= 0x966 && c <= 0x96f) ||
543
(c >= 0x9e6 && c <= 0x9ef) ||
544
(c >= 0xa66 && c <= 0xa6f) ||
545
(c >= 0xae6 && c <= 0xaef) ||
546
(c >= 0xb66 && c <= 0xb6f) ||
547
(c >= 0xbe7 && c <= 0xbef) ||
548
(c >= 0xc66 && c <= 0xc6f) ||
549
(c >= 0xce6 && c <= 0xcef) ||
550
(c >= 0xd66 && c <= 0xd6f) ||
551
(c >= 0xe50 && c <= 0xe59) ||
552
(c >= 0xed0 && c <= 0xed9) ||
553
(c >= 0xf20 && c <= 0xf29) ||
562
(c >= 0x3031 && c <= 0x3035) ||
563
(c >= 0x309d && c <= 0x309e) ||
564
(c >= 0x30fc && c <= 0x30fe));
571
return (Bool)(map & lowercase);
578
return (Bool)(map & uppercase);
596
c += (uint) ('A' - 'a' );
601
char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
603
if ( !cfgBool(doc, TidyXmlTags) )
607
c = (tmbchar) ToUpper(c);
609
else /* force to lower case */
611
c = (tmbchar) ToLower(c);
619
return last character in string
620
this is useful when trailing quotemark
621
is missing on an attribute
623
static tmbchar LastChar( tmbstr str )
627
int n = tmbstrlen(str);
634
node->type is one of these:
639
#define StartEndTag 4
642
Lexer* NewLexer( TidyDocImpl* doc )
644
Lexer* lexer = (Lexer*) MemAlloc( sizeof(Lexer) );
648
ClearMemory( lexer, sizeof(Lexer) );
652
lexer->state = LEX_CONTENT;
654
lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
655
lexer->doctype = VERS_UNKNOWN;
656
lexer->root = &doc->root;
661
/* Report whether the document input is exhausted: docIn->pushed is
** false and IsEOF() holds for the input stream.
** doc->docIn must not be NULL (enforced by assert). */
Bool EndOfInput( TidyDocImpl* doc )
{
    assert( doc->docIn != NULL );
    return ( !doc->docIn->pushed && IsEOF(doc->docIn) );
}
667
void FreeLexer( TidyDocImpl* doc )
669
Lexer *lexer = doc->lexer;
675
FreeNode( doc, lexer->token );
677
while ( lexer->istacksize > 0 )
678
PopInline( doc, NULL );
680
MemFree( lexer->istack );
681
MemFree( lexer->lexbuf );
687
/* Lexer uses bigger memory chunks than pprint as
688
** it must hold the entire input document, not just
689
** the last line or three.
691
/* Append one byte to the lexer buffer, enlarging the buffer first when
** it cannot hold the new byte plus a trailing NUL. */
void AddByte( Lexer *lexer, tmbchar ch )
693
/* grow when lexsize + 2 would reach the current capacity (lexlength) */
if ( lexer->lexsize + 2 >= lexer->lexlength )
696
uint allocAmt = lexer->lexlength;
697
/* loop until allocAmt exceeds lexsize + 2; the statement that
** enlarges allocAmt is not visible in this view */
while ( lexer->lexsize + 2 >= allocAmt )
704
buf = (tmbstr) MemRealloc( lexer->lexbuf, allocAmt );
707
/* zero the bytes between the old capacity and the new one */
ClearMemory( buf + lexer->lexlength,
708
allocAmt - lexer->lexlength );
710
lexer->lexlength = allocAmt;
714
/* store the byte, then keep the buffer NUL-terminated after it */
lexer->lexbuf[ lexer->lexsize++ ] = ch;
715
lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */
718
/* Overwrite the most recently stored byte of the lexer buffer with c.
** Does nothing when the buffer is empty (lexsize == 0). */
static void ChangeChar( Lexer *lexer, tmbchar c )
{
    if ( lexer->lexsize > 0 )
    {
        lexer->lexbuf[ lexer->lexsize-1 ] = c;
    }
}
726
/* store character c as UTF-8 encoded byte stream */
727
/* Encode c with EncodeCharToUTF8Bytes() into a local buffer and append
** the resulting `count` bytes to the lexer buffer through AddByte(). */
void AddCharToLexer( Lexer *lexer, uint c )
729
int i, err, count = 0;
730
tmbchar buf[10] = {0};
732
err = EncodeCharToUTF8Bytes( c, buf, NULL, &count );
735
/* NOTE(review): the condition that selects the fallback below is not
** visible in this view; presumably it is a non-zero err -- confirm. */
#if 0 && defined(_DEBUG)
736
fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c );
738
/* replacement character 0xFFFD encoded as UTF-8 */
739
buf[0] = (byte) 0xEF;
740
buf[1] = (byte) 0xBF;
741
buf[2] = (byte) 0xBD;
745
/* copy the encoded bytes into the lexer buffer one at a time */
for ( i = 0; i < count; ++i )
746
AddByte( lexer, buf[i] );
749
static void AddStringToLexer( Lexer *lexer, ctmbstr str )
753
/* Many (all?) compilers will sign-extend signed chars (the default) when
754
** converting them to unsigned integer values. We must cast our char to
755
** unsigned char before assigning it to prevent this from happening.
757
while( 0 != (c = (unsigned char) *str++ ))
758
AddCharToLexer( lexer, c );
762
No longer attempts to insert missing ';' for unknown
763
entities unless one was present already, since this
764
gives unexpected results.
766
For example: <a href="something.htm?foo&bar&fred">
767
was tidied to: <a href="something.htm?foo&bar;&fred;">
768
rather than: <a href="something.htm?foo&bar&fred">
770
My thanks to Maurice Buxton for spotting this.
772
Also Randy Waki pointed out the following case for the
773
04 Aug 00 version (bug #433012):
775
For example: <a href="something.htm?id=1&lang=en">
776
was tidied to: <a href="something.htm?id=1&lang;=en">
777
rather than: <a href="something.htm?id=1&lang=en">
779
where "lang" is a known entity (#9001), but browsers would
780
misinterpret "&lang;" because it had a value > 256.
782
So the case of an apparently known entity with a value > 256 and
783
missing a semicolon is handled specially.
785
"ParseEntity" is also a bit of a misnomer - it handles entities and
786
numeric character references. Invalid NCR's are now reported.
788
static void ParseEntity( TidyDocImpl* doc, int mode )
791
Bool first = yes, semicolon = no, found = no;
792
Bool isXml = cfgBool( doc, TidyXmlTags );
793
uint c, ch, startcol, entver = 0;
794
Lexer* lexer = doc->lexer;
796
start = lexer->lexsize - 1; /* to start at "&" */
797
startcol = doc->docIn->curcol - 1;
799
while ( (c = ReadChar(doc->docIn)) != EndOfStream )
807
if (first && c == '#')
809
#if SUPPORT_ASIAN_ENCODINGS
810
if ( !cfgBool(doc, TidyNCR) ||
811
cfg(doc, TidyInCharEncoding) == BIG5 ||
812
cfg(doc, TidyInCharEncoding) == SHIFTJIS )
814
UngetChar('#', doc->docIn);
818
AddCharToLexer( lexer, c );
827
AddCharToLexer( lexer, c );
831
/* otherwise put it back */
833
UngetChar( c, doc->docIn );
837
/* make sure entity is NULL terminated */
838
lexer->lexbuf[lexer->lexsize] = '\0';
840
/* Should constrain version to XML/XHTML if &apos;
841
** is encountered. But this is not possible with
842
** Tidy's content model bit mask.
844
if ( tmbstrcmp(lexer->lexbuf+start, "&apos") == 0
845
&& !cfgBool(doc, TidyXmlOut)
847
&& !cfgBool(doc, TidyXhtmlOut) )
848
ReportEntityError( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );
850
/* Lookup entity code and version
852
found = EntityInfo( lexer->lexbuf+start, isXml, &ch, &entver );
854
/* deal with unrecognized or invalid entities */
855
/* #433012 - fix by Randy Waki 17 Feb 01 */
856
/* report invalid NCR's - Terry Teague 01 Sep 01 */
857
if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
859
/* set error position just before offending character */
860
lexer->lines = doc->docIn->curline;
861
lexer->columns = startcol;
863
if (lexer->lexsize > start + 1)
865
if (ch >= 128 && ch <= 159)
867
/* invalid numeric character reference */
870
int replaceMode = DISCARDED_CHAR;
872
if ( ReplacementCharEncoding == WIN1252 )
873
c1 = DecodeWin1252( ch );
874
else if ( ReplacementCharEncoding == MACROMAN )
875
c1 = DecodeMacRoman( ch );
878
replaceMode = REPLACED_CHAR;
880
if ( c != ';' ) /* issue warning if not terminated by ';' */
881
ReportEntityError( doc, MISSING_SEMICOLON_NCR,
882
lexer->lexbuf+start, c );
884
ReportEncodingError(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);
888
/* make the replacement */
889
lexer->lexsize = start;
890
AddCharToLexer( lexer, c1 );
896
lexer->lexsize = start;
902
ReportEntityError( doc, UNKNOWN_ENTITY,
903
lexer->lexbuf+start, ch );
906
AddCharToLexer( lexer, ';' );
909
ReportEntityError( doc, UNESCAPED_AMPERSAND,
910
lexer->lexbuf+start, ch );
914
if ( c != ';' ) /* issue warning if not terminated by ';' */
916
/* set error position just before offending character */
917
lexer->lines = doc->docIn->curline;
918
lexer->columns = startcol;
919
ReportEntityError( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
922
lexer->lexsize = start;
923
if ( ch == 160 && (mode & Preformatted) )
925
AddCharToLexer( lexer, ch );
927
if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
928
AddStringToLexer( lexer, "amp;" );
930
/* Detect extended vs. basic entities */
931
ConstrainVersion( doc, entver );
935
static tmbchar ParseTagName( TidyDocImpl* doc )
937
Lexer *lexer = doc->lexer;
938
uint c = lexer->lexbuf[ lexer->txtstart ];
939
Bool xml = cfgBool(doc, TidyXmlTags);
941
/* fold case of first character in buffer */
942
if (!xml && IsUpper(c))
943
lexer->lexbuf[lexer->txtstart] = (tmbchar) ToLower(c);
945
while ((c = ReadChar(doc->docIn)) != EndOfStream)
947
if ((!xml && !IsNamechar(c)) ||
948
(xml && !IsXMLNamechar(c)))
951
/* fold case of subsequent characters */
952
if (!xml && IsUpper(c))
955
AddCharToLexer(lexer, c);
958
lexer->txtend = lexer->lexsize;
963
Used for elements and text nodes
964
element name is NULL for text nodes
965
start and end are offsets into lexbuf
966
which contains the textual content of
967
all elements in the parse tree.
969
parent and content allow traversal
970
of the parse tree in any direction.
971
attributes are represented as a linked
972
list of AttVal nodes which hold the
973
strings for attribute/value pairs.
977
/* Allocate a zero-filled Node, stamp it with the lexer's current
** line/column position, and give it the default type TextNode.
** NOTE(review): NewDocTypeNode() calls NewNode( NULL ); whatever guards
** the lexer dereferences below is not visible in this view -- confirm. */
Node *NewNode(Lexer *lexer)
979
Node* node = (Node*) MemAlloc( sizeof(Node) );
980
ClearMemory( node, sizeof(Node) );
983
/* record where in the input this node was created */
node->line = lexer->lines;
984
node->column = lexer->columns;
986
node->type = TextNode;
990
/* used to clone heading nodes when split by an <HR> */
991
Node *CloneNode( TidyDocImpl* doc, Node *element )
993
Lexer* lexer = doc->lexer;
994
Node *node = NewNode( lexer );
996
node->start = lexer->lexsize;
997
node->end = lexer->lexsize;
1001
node->parent = element->parent;
1002
node->type = element->type;
1003
node->closed = element->closed;
1004
node->implicit = element->implicit;
1005
node->tag = element->tag;
1006
node->element = tmbstrdup( element->element );
1007
node->attributes = DupAttrs( doc, element->attributes );
1012
/* free node's attributes */
1013
void FreeAttrs( TidyDocImpl* doc, Node *node )
1016
while ( node->attributes )
1018
AttVal *av = node->attributes;
1020
if ( av->attribute )
1022
if ( (attrIsID(av) || attrIsNAME(av)) &&
1023
IsAnchorElement(doc, node) )
1025
RemoveAnchorByNode( doc, node );
1029
node->attributes = av->next;
1030
FreeAttribute( doc, av );
1034
/* doesn't repair attribute list linkage */
1035
void FreeAttribute( TidyDocImpl* doc, AttVal *av )
1037
FreeNode( doc, av->asp );
1038
FreeNode( doc, av->php );
1039
MemFree( av->attribute );
1040
MemFree( av->value );
1044
/* detach attribute from node
1046
void DetachAttribute( Node *node, AttVal *attr )
1048
AttVal *av, *prev = NULL;
1050
for ( av = node->attributes; av; av = av->next )
1055
prev->next = attr->next;
1057
node->attributes = attr->next;
1064
/* detach attribute from node then free it
1066
void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr )
1068
DetachAttribute( node, attr );
1069
FreeAttribute( doc, attr );
1073
Free document nodes by iterating through peers and recursing
1074
through children. Set next to NULL before calling FreeNode()
1075
to avoid freeing peer nodes. Doesn't patch up prev/next links.
1077
void FreeNode( TidyDocImpl* doc, Node *node )
1081
Node* next = node->next;
1083
FreeAttrs( doc, node );
1084
FreeNode( doc, node->content );
1085
MemFree( node->element );
1086
#ifdef TIDY_STORE_ORIGINAL_TEXT
1088
MemFree(node->otext);
1090
if (RootNode != node->type)
1093
node->content = NULL;
1099
#ifdef TIDY_STORE_ORIGINAL_TEXT
1100
void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count)
1102
if (!doc->storeText)
1105
if (count >= doc->docIn->otextlen)
1108
if (!doc->docIn->otextsize)
1113
node->otext = doc->docIn->otextbuf;
1114
doc->docIn->otextbuf = NULL;
1115
doc->docIn->otextlen = 0;
1116
doc->docIn->otextsize = 0;
1120
uint len = doc->docIn->otextlen;
1121
tmbstr buf1 = (tmbstr)MemAlloc(len - count + 1);
1122
tmbstr buf2 = (tmbstr)MemAlloc(count + 1);
1127
for (i = 0; i < len - count; ++i)
1128
buf1[i] = doc->docIn->otextbuf[i];
1132
for (j = 0; j + i < len; ++j)
1133
buf2[j] = doc->docIn->otextbuf[j + i];
1137
MemFree(doc->docIn->otextbuf);
1139
doc->docIn->otextbuf = buf2;
1140
doc->docIn->otextlen = count;
1141
doc->docIn->otextsize = count + 1;
1146
Node* TextToken( Lexer *lexer )
1148
Node *node = NewNode( lexer );
1149
node->start = lexer->txtstart;
1150
node->end = lexer->txtend;
1154
/* used for creating preformatted text from Word2000 */
1155
Node *NewLineNode( Lexer *lexer )
1157
Node *node = NewNode( lexer );
1158
node->start = lexer->lexsize;
1159
AddCharToLexer( lexer, (uint)'\n' );
1160
node->end = lexer->lexsize;
1164
/* used for adding a &nbsp; for Word2000 */
1165
Node* NewLiteralTextNode( Lexer *lexer, ctmbstr txt )
1167
Node *node = NewNode( lexer );
1168
node->start = lexer->lexsize;
1169
AddStringToLexer( lexer, txt );
1170
node->end = lexer->lexsize;
1174
static Node* TagToken( TidyDocImpl* doc, NodeType type )
1176
Lexer* lexer = doc->lexer;
1177
Node* node = NewNode( lexer );
1179
node->element = tmbstrndup( lexer->lexbuf + lexer->txtstart,
1180
lexer->txtend - lexer->txtstart );
1181
node->start = lexer->txtstart;
1182
node->end = lexer->txtstart;
1184
if ( type == StartTag || type == StartEndTag || type == EndTag )
1190
static Node* NewToken(TidyDocImpl* doc, NodeType type)
1192
Lexer* lexer = doc->lexer;
1193
Node* node = NewNode(lexer);
1195
node->start = lexer->txtstart;
1196
node->end = lexer->txtend;
1197
#ifdef TIDY_STORE_ORIGINAL_TEXT
1198
StoreOriginalTextInToken(doc, node, 0);
1203
#define CommentToken(doc) NewToken(doc, CommentTag)
1204
#define DocTypeToken(doc) NewToken(doc, DocTypeTag)
1205
#define PIToken(doc) NewToken(doc, ProcInsTag)
1206
#define AspToken(doc) NewToken(doc, AspTag)
1207
#define JsteToken(doc) NewToken(doc, JsteTag)
1208
#define PhpToken(doc) NewToken(doc, PhpTag)
1209
#define XmlDeclToken(doc) NewToken(doc, XmlDecl)
1210
#define SectionToken(doc) NewToken(doc, SectionTag)
1211
#define CDATAToken(doc) NewToken(doc, CDATATag)
1213
void AddStringLiteral( Lexer* lexer, ctmbstr str )
1216
while(0 != (c = *str++) )
1217
AddCharToLexer( lexer, c );
1220
void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1225
for ( ix=0; ix < len && (c = *str++); ++ix )
1226
AddCharToLexer(lexer, c);
1229
/* find doctype element */
1230
Node *FindDocType( TidyDocImpl* doc )
1233
for ( node = (doc ? doc->root.content : NULL);
1234
node && node->type != DocTypeTag;
1240
/* find parent container element */
1241
Node* FindContainer( Node* node )
1243
for ( node = (node ? node->parent : NULL);
1244
node && nodeHasCM(node, CM_INLINE);
1245
node = node->parent )
1252
/* find html element */
1253
Node *FindHTML( TidyDocImpl* doc )
1256
for ( node = (doc ? doc->root.content : NULL);
1257
node && !nodeIsHTML(node);
1264
/* find XML Declaration */
1265
Node *FindXmlDecl(TidyDocImpl* doc)
1268
for ( node = (doc ? doc->root.content : NULL);
1269
node && !(node->type == XmlDecl);
1277
Node *FindHEAD( TidyDocImpl* doc )
1279
Node *node = FindHTML( doc );
1283
for ( node = node->content;
1284
node && !nodeIsHEAD(node);
1292
Node *FindTITLE(TidyDocImpl* doc)
1294
Node *node = FindHEAD(doc);
1297
for (node = node->content;
1298
node && !nodeIsTITLE(node);
1299
node = node->next) {}
1304
Node *FindBody( TidyDocImpl* doc )
1306
Node *node = ( doc ? doc->root.content : NULL );
1308
while ( node && !nodeIsHTML(node) )
1314
node = node->content;
1315
while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) )
1318
if ( node && nodeIsFRAMESET(node) )
1320
node = node->content;
1321
while ( node && !nodeIsNOFRAMES(node) )
1326
node = node->content;
1327
while ( node && !nodeIsBODY(node) )
1335
/* add meta element for Tidy */
1336
Bool AddGenerator( TidyDocImpl* doc )
1340
Node *head = FindHEAD( doc );
1345
#ifdef PLATFORM_NAME
1346
tmbsnprintf(buf, sizeof(buf), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org",
1349
tmbsnprintf(buf, sizeof(buf), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate());
1352
for ( node = head->content; node; node = node->next )
1354
if ( nodeIsMETA(node) )
1356
attval = AttrGetById(node, TidyAttr_NAME);
1358
if (AttrValueIs(attval, "generator"))
1360
attval = AttrGetById(node, TidyAttr_CONTENT);
1362
if (AttrHasValue(attval) &&
1363
tmbstrncasecmp(attval->value, "HTML Tidy", 9) == 0)
1365
/* update the existing content to reflect the */
1366
/* actual version of Tidy currently being used */
1368
MemFree(attval->value);
1369
attval->value = tmbstrdup(buf);
1376
if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
1378
node = InferredTag(doc, TidyTag_META);
1379
AddAttribute( doc, node, "name", "generator" );
1380
AddAttribute( doc, node, "content", buf );
1381
InsertNodeAtStart( head, node );
1389
/* examine <!DOCTYPE> to identify version */
1390
uint FindGivenVersion( TidyDocImpl* doc, Node* doctype )
1392
AttVal * fpi = GetAttrByName(doctype, "PUBLIC");
1395
if (!fpi || !fpi->value)
1396
return VERS_UNKNOWN;
1398
vers = GetVersFromFPI(fpi->value);
1400
if (VERS_XHTML & vers)
1402
SetOptionBool(doc, TidyXmlOut, yes);
1403
SetOptionBool(doc, TidyXhtmlOut, yes);
1404
doc->lexer->isvoyager = yes;
1407
/* todo: add a warning if case does not match? */
1408
MemFree(fpi->value);
1409
fpi->value = tmbstrdup(GetFPIFromVers(vers));
1414
/* return guessed version */
1415
uint ApparentVersion( TidyDocImpl* doc )
1417
if ((doc->lexer->doctype == XH11 ||
1418
doc->lexer->doctype == XB10) &&
1419
(doc->lexer->versions & doc->lexer->doctype))
1420
return doc->lexer->doctype;
1422
return HTMLVersion(doc);
1425
ctmbstr HTMLVersionNameFromCode( uint vers, Bool ARG_UNUSED(isXhtml) )
1427
ctmbstr name = GetNameFromVers(vers);
1429
/* this test has moved to ReportMarkupVersion() in localize.c, for localization reasons */
1432
name = "HTML Proprietary";
1438
/* Put DOCTYPE declaration between the
1439
** <?xml version="1.0" ... ?> declaration, if any,
1440
** and the <html> tag. Should also work for any comments,
1441
** etc. that may precede the <html> tag.
1444
static Node* NewDocTypeNode( TidyDocImpl* doc )
1446
Node* doctype = NULL;
1447
Node* html = FindHTML( doc );
1448
Node* root = &doc->root;
1452
doctype = NewNode( NULL );
1453
doctype->type = DocTypeTag;
1454
doctype->next = html;
1455
doctype->parent = root;
1457
if ( html == root->content )
1459
/* No <?xml ... ?> declaration. */
1460
root->content->prev = doctype;
1461
root->content = doctype;
1462
doctype->prev = NULL;
1466
/* we have an <?xml ... ?> declaration. */
1467
doctype->prev = html->prev;
1468
doctype->prev->next = doctype;
1470
html->prev = doctype;
1474
/*
** Rewrites or creates the doctype node so that it names an XHTML DTD.
**
** Visible steps:
**  - lexer->versionEmitted starts out as ApparentVersion(doc).
**  - TidyDoctypeOmit mode discards the existing doctype node.
**  - TidyDoctypeUser mode with no configured TidyDoctype string is tested
**    separately (the action taken is not visible here).
**  - A doctype node is created through NewDocTypeNode() with element name
**    html, or the element name of the existing node is lower-cased.
**  - The mode then selects the PUBLIC and SYSTEM values passed to
**    RepairAttrValue(): Strict uses X10S, Loose uses X10T, User uses the
**    configured string with an empty SYSTEM value, and Auto walks a chain
**    of tests on lexer->versions and lexer->doctype that may choose XH11,
**    XB10, X10S, X10F or X10T.  versionEmitted is updated to match in all
**    but the User case.
**
** NOTE(review): the switch header, break and return statements, else
** keywords and braces are not visible in this listing, and the bare
** integers look like extraction residue - confirm against upstream.
*/
Bool SetXHTMLDocType( TidyDocImpl* doc )
1476
Lexer *lexer = doc->lexer;
1477
Node *doctype = FindDocType( doc );
1478
TidyDoctypeModes dtmode = cfg(doc, TidyDoctypeMode);
1479
ctmbstr pub = "PUBLIC";
1480
ctmbstr sys = "SYSTEM";
1482
lexer->versionEmitted = ApparentVersion( doc );
1484
if (dtmode == TidyDoctypeOmit)
1487
DiscardElement(doc, doctype);
1491
if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype))
1496
/* make a fresh doctype node, or normalise the case of the existing one */
doctype = NewDocTypeNode(doc);
1497
doctype->element = tmbstrdup("html");
1501
doctype->element = tmbstrtolower(doctype->element);
1506
case TidyDoctypeStrict:
1507
/* XHTML 1.0 Strict */
1508
RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10S));
1509
RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10S));
1510
lexer->versionEmitted = X10S;
1512
case TidyDoctypeLoose:
1513
/* XHTML 1.0 Transitional */
1514
RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10T));
1515
RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10T));
1516
lexer->versionEmitted = X10T;
1518
case TidyDoctypeUser:
1519
/* user defined document type declaration */
1520
RepairAttrValue(doc, doctype, pub, cfgStr(doc, TidyDoctype));
1521
RepairAttrValue(doc, doctype, sys, "");
1523
case TidyDoctypeAuto:
1524
/* declared XH11 that is still permitted: only add SYSTEM if absent */
if (lexer->versions & XH11 && lexer->doctype == XH11)
1526
if (!GetAttrByName(doctype, sys))
1527
RepairAttrValue(doc, doctype, sys, GetSIFromVers(XH11));
1528
lexer->versionEmitted = XH11;
1531
else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40))
1533
RepairAttrValue(doc, doctype, pub, GetFPIFromVers(XH11));
1534
RepairAttrValue(doc, doctype, sys, GetSIFromVers(XH11));
1535
lexer->versionEmitted = XH11;
1537
/* declared XB10 that is still permitted: only add SYSTEM if absent */
else if (lexer->versions & XB10 && lexer->doctype == XB10)
1539
if (!GetAttrByName(doctype, sys))
1540
RepairAttrValue(doc, doctype, sys, GetSIFromVers(XB10));
1541
lexer->versionEmitted = XB10;
1544
else if (lexer->versions & VERS_HTML40_STRICT)
1546
RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10S));
1547
RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10S));
1548
lexer->versionEmitted = X10S;
1550
else if (lexer->versions & VERS_FRAMESET)
1552
RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10F));
1553
RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10F));
1554
lexer->versionEmitted = X10F;
1556
else if (lexer->versions & VERS_LOOSE)
1558
RepairAttrValue(doc, doctype, pub, GetFPIFromVers(X10T));
1559
RepairAttrValue(doc, doctype, sys, GetSIFromVers(X10T));
1560
lexer->versionEmitted = X10T;
1565
/* NOTE(review): the condition guarding this discard is not visible */
DiscardElement(doc, doctype);
1574
/* fixup doctype if missing */
/*
** Decides which doctype declaration the output should carry.
**
** Visible steps:
**  - In TidyDoctypeAuto mode, when the declared doctype bit is still set
**    in lexer->versions, the declared doctype is not an XHTML one that
**    lacks the isvoyager flag, and a doctype node exists, versionEmitted
**    is set to the declared doctype.
**  - In TidyDoctypeOmit mode the doctype node is discarded and
**    versionEmitted is set to ApparentVersion(doc).
**  - TidyXmlOut is tested (the action taken is not visible here).
**  - Whether the node had a SYSTEM attribute is remembered in hadSI.
**  - In Strict or Loose mode an existing doctype node is discarded.
**  - In Auto mode the version is guessed with HTMLVersion(doc).
**  - versionEmitted receives the guess; VERS_UNKNOWN is tested for.
**  - The element name of an existing node is lower-cased, or a new node
**    named html is created, and PUBLIC and SYSTEM are repaired from the
**    guessed version.
**
** NOTE(review): the declaration of hadSI, the bodies of the Strict and
** Loose cases, the conditions around the SYSTEM repair, all returns and
** all braces are not visible in this listing - confirm against upstream.
*/
1575
Bool FixDocType( TidyDocImpl* doc )
1577
Lexer* lexer = doc->lexer;
1578
Node* doctype = FindDocType( doc );
1579
uint dtmode = cfg( doc, TidyDoctypeMode );
1580
uint guessed = VERS_UNKNOWN;
1583
/* keep the declared doctype when it is still consistent with the input */
if (dtmode == TidyDoctypeAuto &&
1584
lexer->versions & lexer->doctype &&
1585
!(VERS_XHTML & lexer->doctype && !lexer->isvoyager)
1586
&& FindDocType(doc))
1588
lexer->versionEmitted = lexer->doctype;
1592
if (dtmode == TidyDoctypeOmit)
1595
DiscardElement( doc, doctype );
1596
lexer->versionEmitted = ApparentVersion( doc );
1600
if (cfgBool(doc, TidyXmlOut))
1604
/* remember whether the input doctype carried a system identifier */
hadSI = GetAttrByName(doctype, "SYSTEM") != NULL;
1606
if ((dtmode == TidyDoctypeStrict ||
1607
dtmode == TidyDoctypeLoose) && doctype)
1609
DiscardElement(doc, doctype);
1615
case TidyDoctypeStrict:
1618
case TidyDoctypeLoose:
1621
case TidyDoctypeAuto:
1622
guessed = HTMLVersion(doc);
1626
lexer->versionEmitted = guessed;
1627
if (guessed == VERS_UNKNOWN)
1632
doctype->element = tmbstrtolower(doctype->element);
1636
doctype = NewDocTypeNode(doc);
1637
doctype->element = tmbstrdup("html");
1640
RepairAttrValue(doc, doctype, "PUBLIC", GetFPIFromVers(guessed));
1643
RepairAttrValue(doc, doctype, "SYSTEM", GetSIFromVers(guessed));
1648
/* ensure XML document starts with <?xml version="1.0"?> */
1649
/* add encoding attribute if not using ASCII or UTF-8 output */
/*
** Reuses the first child of the root when it is an XmlDecl node;
** otherwise a new XmlDecl node is allocated and linked in as the first
** child.  An encoding attribute is added when none exists and the output
** encoding option is not UTF8, using GetEncodingNameFromTidyId() for the
** name.  A version attribute of 1.0 is added when none exists.
**
** NOTE(review): the first header comment mentions ASCII, but the only
** test visible here compares against UTF8 - any ASCII handling is not
** visible.  xml->next is assigned root->content twice in the visible
** lines; the second assignment looks redundant.  The declaration of xml,
** the else keyword, the delimiters around the free-text remark in the
** middle, the return and all braces are not visible - confirm upstream.
*/
1650
Bool FixXmlDecl( TidyDocImpl* doc )
1653
AttVal *version, *encoding;
1654
Lexer*lexer = doc->lexer;
1655
Node* root = &doc->root;
1657
if ( root->content && root->content->type == XmlDecl )
1659
xml = root->content;
1663
/* no declaration yet: build one and put it first */
xml = NewNode(lexer);
1664
xml->type = XmlDecl;
1665
xml->next = root->content;
1667
if ( root->content )
1669
root->content->prev = xml;
1670
xml->next = root->content;
1673
root->content = xml;
1676
version = GetAttrByName(xml, "version");
1677
encoding = GetAttrByName(xml, "encoding");
1680
We need to insert a check if declared encoding
1681
and output encoding mismatch and fix the XML
1682
declaration accordingly!!!
1685
if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 )
1687
ctmbstr enc = GetEncodingNameFromTidyId(cfg(doc, TidyOutCharEncoding));
1689
AddAttribute( doc, xml, "encoding", enc );
1692
if ( version == NULL )
1693
AddAttribute( doc, xml, "version", "1.0" );
1697
/*
** Builds a start tag node for the given tag id without consuming input.
** The node comes from NewNode(lexer), is typed StartTag, flagged as
** implicit, named with a copy of the dictionary entry name, and given
** the current lexer->txtstart and lexer->txtend as its text range.
** The dictionary lookup is asserted to succeed.
**
** NOTE(review): statements between the visible lines (for example any
** assignment of the tag pointer) and the return are not visible here.
*/
Node* InferredTag(TidyDocImpl* doc, TidyTagId id)
1699
Lexer *lexer = doc->lexer;
1700
Node *node = NewNode( lexer );
1701
const Dict* dict = LookupTagDef(id);
1703
assert( dict != NULL );
1705
node->type = StartTag;
1706
node->implicit = yes;
1707
node->element = tmbstrdup(dict->name);
1709
node->start = lexer->txtstart;
1710
node->end = lexer->txtend;
1715
/*
** Classifies a node by three successive tests: whether it is something
** other than a StartTag, whether it has no tag definition, and whether
** its content model has the CM_EMPTY bit.
**
** NOTE(review): the value returned for each test is not visible in this
** listing.  Going by the name, non start tags and CM_EMPTY elements
** presumably yield no and everything else yes - confirm upstream.
*/
Bool ExpectsContent(Node *node)
1717
if (node->type != StartTag)
1720
/* unknown element? */
1721
if (node->tag == NULL)
1724
if (node->tag->model & CM_EMPTY)
1731
/*
** GetCDATA: reads the raw content of a container element (the residual
** description below names style and script) and returns it as a text
** token via TextToken().
**
** The reader is a small state machine over three states:
**   CDATA_INTERMEDIATE - scanning ordinary content,
**   CDATA_STARTTAG     - a tag name following an opening angle bracket
**                        is being collected,
**   CDATA_ENDTAG       - a tag name following an angle bracket and slash
**                        is being collected.
** Collected names are compared, ignoring case, with container->element.
** A matching end tag at nesting depth zero or below is pushed back onto
** the input and trimmed from the lexer buffer.  Any other end tag that
** is not already preceded by a backslash is reported as
** BAD_CDATA_CONTENT and, when IsJavaScript(container) holds, gets a
** backslash inserted in front of its slash.  An empty script element
** that has a src attribute and is followed by other markup is ended
** early by pushing the markup back.  Reaching end of stream reports
** MISSING_ENDTAG_FOR against the container.
**
** NOTE(review): lexbuf is indexed at start - 2; this presumably relies
** on the two markup characters always preceding start - confirm.
** Declarations of c, start, nested, matches, isEmpty and i, most branch
** conditions, the delimiters of the residual description and all braces
** are not visible in this listing - confirm against upstream.
*/
create a text node for the contents of
1732
a CDATA element like style or script
1733
which ends with </foo> for some foo.
1736
#define CDATA_INTERMEDIATE 1
1737
#define CDATA_STARTTAG 2
1738
#define CDATA_ENDTAG 3
1740
Node *GetCDATA( TidyDocImpl* doc, Node *container )
1742
Lexer* lexer = doc->lexer;
1745
int state = CDATA_INTERMEDIATE;
1750
Bool hasSrc = AttrGetById(container, TidyAttr_SRC) != NULL;
1752
lexer->lines = doc->docIn->curline;
1753
lexer->columns = doc->docIn->curcol;
1754
lexer->waswhite = no;
1755
lexer->txtstart = lexer->txtend = lexer->lexsize;
1757
/* seen start tag, look for matching end tag */
1758
while ((c = ReadChar(doc->docIn)) != EndOfStream)
1760
AddCharToLexer(lexer, c);
1761
lexer->txtend = lexer->lexsize;
1763
if (state == CDATA_INTERMEDIATE)
1767
if (isEmpty && !IsWhite(c))
1772
c = ReadChar(doc->docIn);
1776
/* <head><script src=foo><meta name=foo content=bar>*/
1777
if (hasSrc && isEmpty && nodeIsSCRIPT(container))
1779
/* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
1780
/* drop what was buffered and hand the markup back to the input */
lexer->lexsize = lexer->txtstart;
1781
UngetChar(c, doc->docIn);
1782
UngetChar('<', doc->docIn);
1785
AddCharToLexer(lexer, c);
1786
start = lexer->lexsize - 1;
1787
state = CDATA_STARTTAG;
1791
AddCharToLexer(lexer, c);
1793
c = ReadChar(doc->docIn);
1797
UngetChar(c, doc->docIn);
1800
UngetChar(c, doc->docIn);
1802
start = lexer->lexsize;
1803
state = CDATA_ENDTAG;
1807
/* recognize document.write("<script><\/script>") */
1808
AddCharToLexer(lexer, c);
1810
c = ReadChar(doc->docIn);
1814
UngetChar(c, doc->docIn);
1818
AddCharToLexer(lexer, c);
1819
c = ReadChar(doc->docIn);
1823
UngetChar(c, doc->docIn);
1826
UngetChar(c, doc->docIn);
1828
start = lexer->lexsize;
1829
state = CDATA_ENDTAG;
1833
UngetChar(c, doc->docIn);
1836
/* '<' + Letter found */
1837
else if (state == CDATA_STARTTAG)
1842
matches = tmbstrncasecmp(container->element, lexer->lexbuf + start,
1843
tmbstrlen(container->element)) == 0;
1847
state = CDATA_INTERMEDIATE;
1849
/* '<' + '/' + Letter found */
1850
else if (state == CDATA_ENDTAG)
1855
matches = tmbstrncasecmp(container->element, lexer->lexbuf + start,
1856
tmbstrlen(container->element)) == 0;
1858
if (isEmpty && !matches)
1860
/* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
1862
/* push the collected name and its markup back in reverse order */
for (i = lexer->lexsize - 1; i >= start; --i)
1863
UngetChar((uint)lexer->lexbuf[i], doc->docIn);
1864
UngetChar('/', doc->docIn);
1865
UngetChar('<', doc->docIn);
1869
if (matches && nested-- <= 0)
1871
for (i = lexer->lexsize - 1; i >= start; --i)
1872
UngetChar((uint)lexer->lexbuf[i], doc->docIn);
1873
UngetChar('/', doc->docIn);
1874
UngetChar('<', doc->docIn);
1875
/* trim the name plus the two markup characters from the buffer */
lexer->lexsize -= (lexer->lexsize - start) + 2;
1878
else if (lexer->lexbuf[start - 2] != '\\')
1880
/* if the end tag is not already escaped using backslash */
1881
lexer->lines = doc->docIn->curline;
1882
lexer->columns = doc->docIn->curcol - 3;
1883
ReportError(doc, NULL, NULL, BAD_CDATA_CONTENT);
1885
/* if javascript insert backslash before / */
1886
if (IsJavaScript(container))
1888
/* shift the tail right by one to open a slot for the backslash */
for (i = lexer->lexsize; i > start-1; --i)
1889
lexer->lexbuf[i] = lexer->lexbuf[i-1];
1891
lexer->lexbuf[start-1] = '\\';
1895
state = CDATA_INTERMEDIATE;
1899
lexer->lexsize = lexer->txtstart = lexer->txtend;
1901
lexer->txtend = lexer->lexsize;
1903
if (c == EndOfStream)
1904
ReportError(doc, container, NULL, MISSING_ENDTAG_FOR );
1906
/* if (lexer->txtend > lexer->txtstart) */
1907
return TextToken(lexer);
1912
/*
** Marks the most recent token as pushed back by setting lexer->pushed.
** NOTE(review): presumably the next GetToken() call hands lexer->token
** out again; the test of the flag is not visible in this listing.
*/
void UngetToken( TidyDocImpl* doc )
1914
doc->lexer->pushed = yes;
1917
/*
** CondReturnTextNode(doc, skip): statement macro intended for use inside
** a function that has a local named lexer.  When the pending text range
** is not empty (txtend greater than txtstart) it builds a text token,
** stores it in lexer->token and returns it from the enclosing function.
** The variant compiled under TIDY_STORE_ORIGINAL_TEXT additionally calls
** StoreOriginalTextInToken() with the skip argument; the other variant
** ignores skip.
**
** NOTE(review): the brace lines of both bodies, and the else and endif
** directives, are not visible in this listing, and the bare integers
** interrupt the line continuations - confirm against upstream.
*/
#ifdef TIDY_STORE_ORIGINAL_TEXT
1918
#define CondReturnTextNode(doc, skip) \
1919
if (lexer->txtend > lexer->txtstart) \
1921
lexer->token = TextToken(lexer); \
1922
StoreOriginalTextInToken(doc, lexer->token, skip); \
1923
return lexer->token; \
1926
#define CondReturnTextNode(doc, skip) \
1927
if (lexer->txtend > lexer->txtstart) \
1929
lexer->token = TextToken(lexer); \
1930
return lexer->token; \
1935
/*
** GetToken: produces the next token from doc->docIn for the given mode.
**
** Before reading input, the visible code:
**  - returns the current lexer->token again when it is not a text node,
**    or when neither lexer->insert nor lexer->inode is set (the guard on
**    the pushed flag is not visible here);
**  - returns InsertedToken(doc) when lexer->insert or lexer->inode is
**    set;
**  - delegates to GetCDATA(doc, lexer->parent) in CdataContent mode.
**
** Otherwise it records the current line and column, clears waswhite,
** sets txtstart and txtend to lexsize, and reads characters in a loop
** that dispatches on lexer->state:
**   LEX_CONTENT   text; whitespace handling depends on mode, entities
**                 are parsed unless mode is IgnoreMarkup
**   LEX_GT        an opening angle bracket was seen; decides between end
**                 tag, comment, doctype, marked section, processing
**                 instruction, ASP, JSTE, start tag, or plain text
**   LEX_ENDTAG    parses the name and returns an EndTag token
**   LEX_STARTTAG  parses name and attributes and returns a StartTag or
**                 StartEndTag token; reports unknown and proprietary
**                 elements and repairs duplicate attributes
**   LEX_COMMENT   collects up to the comment terminator, reporting
**                 MALFORMED_COMMENT for stray hyphen pairs
**   LEX_DOCTYPE   hands over to ParseDocTypeDecl() and records the
**                 version named by the first doctype
**   LEX_PROCINSTR, LEX_PHP, LEX_XMLDECL, LEX_ASP, LEX_JSTE,
**   LEX_SECTION, LEX_CDATA
**                 collect up to the respective terminator and return the
**                 matching token kind
** After the loop, pending text in LEX_CONTENT state is returned as a
** text token, and an unterminated comment is reported and returned as a
** comment token.
**
** NOTE(review): a large share of this function (declarations, most
** conditions, break and continue statements, braces, comment delimiters
** around the free-text remarks) is not visible in this listing, and the
** bare integers look like extraction residue.  Treat the summary above
** as a guide to the visible lines only and confirm against upstream.
*/
modes for GetToken()
1937
MixedContent -- for elements which don't accept PCDATA
1938
Preformatted -- white space preserved as is
1939
IgnoreMarkup -- for CDATA elements such as script, style
1942
Node* GetToken( TidyDocImpl* doc, uint mode )
1944
Lexer* lexer = doc->lexer;
1945
uint c, badcomment = 0;
1947
AttVal *attributes = NULL;
1951
/* duplicate inlines in preference to pushed text nodes when appropriate */
1952
if (lexer->token->type != TextNode || (!lexer->insert && !lexer->inode))
1955
return lexer->token;
1959
/* at start of block elements, unclosed inline
1960
elements are inserted into the token stream */
1962
if (lexer->insert || lexer->inode)
1967
FreeNode( doc, lexer->token );
1969
return lexer->token = InsertedToken( doc );
1972
if (mode == CdataContent)
1974
assert( lexer->parent != NULL );
1978
FreeNode( doc, lexer->token );
1980
return lexer->token = GetCDATA(doc, lexer->parent);
1983
lexer->lines = doc->docIn->curline;
1984
lexer->columns = doc->docIn->curcol;
1985
lexer->waswhite = no;
1987
lexer->txtstart = lexer->txtend = lexer->lexsize;
1989
while ((c = ReadChar(doc->docIn)) != EndOfStream)
1991
if (lexer->insertspace && !(mode & IgnoreWhitespace))
1993
AddCharToLexer(lexer, ' ');
1994
lexer->waswhite = yes;
1995
lexer->insertspace = no;
1998
if (c == 160 && (mode & Preformatted))
2001
AddCharToLexer(lexer, c);
2003
switch (lexer->state)
2005
case LEX_CONTENT: /* element content */
2008
Discard white space if appropriate. Its cheaper
2009
to do this here rather than in parser methods
2010
for elements that don't have mixed content.
2012
if (IsWhite(c) && (mode == IgnoreWhitespace)
2013
&& lexer->lexsize == lexer->txtstart + 1)
2016
lexer->waswhite = no;
2017
lexer->lines = doc->docIn->curline;
2018
lexer->columns = doc->docIn->curcol;
2024
lexer->state = LEX_GT;
2030
/* was previous character white? */
2031
if (lexer->waswhite)
2033
if (mode != Preformatted && mode != IgnoreMarkup)
2036
lexer->lines = doc->docIn->curline;
2037
lexer->columns = doc->docIn->curcol;
2040
else /* prev character wasn't white */
2042
lexer->waswhite = yes;
2044
if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
2045
ChangeChar(lexer, ' ');
2050
else if (c == '&' && mode != IgnoreMarkup)
2051
ParseEntity( doc, mode );
2053
/* this is needed to avoid trimming trailing whitespace */
2054
if (mode == IgnoreWhitespace)
2055
mode = MixedContent;
2057
lexer->waswhite = no;
2060
case LEX_GT: /* < */
2062
/* check for endtag */
2065
if ((c = ReadChar(doc->docIn)) == EndOfStream)
2067
UngetChar(c, doc->docIn);
2071
AddCharToLexer(lexer, c);
2075
lexer->lexsize -= 3;
2076
lexer->txtend = lexer->lexsize;
2077
UngetChar(c, doc->docIn);
2078
lexer->state = LEX_ENDTAG;
2079
lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */
2080
doc->docIn->curcol -= 2;
2082
/* if some text before the </ return it now */
2083
if (lexer->txtend > lexer->txtstart)
2085
/* trim space character before end tag */
2086
if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')
2088
lexer->lexsize -= 1;
2089
lexer->txtend = lexer->lexsize;
2091
lexer->token = TextToken(lexer);
2092
#ifdef TIDY_STORE_ORIGINAL_TEXT
2093
StoreOriginalTextInToken(doc, lexer->token, 3);
2095
return lexer->token;
2098
continue; /* no text so keep going */
2101
/* otherwise treat as CDATA */
2102
lexer->waswhite = no;
2103
lexer->state = LEX_CONTENT;
2107
if (mode == IgnoreMarkup)
2109
/* otherwise treat as CDATA */
2110
lexer->waswhite = no;
2111
lexer->state = LEX_CONTENT;
2116
look out for comments, doctype or marked sections
2117
this isn't quite right, but its getting there ...
2121
c = ReadChar(doc->docIn);
2125
c = ReadChar(doc->docIn);
2129
lexer->state = LEX_COMMENT; /* comment */
2130
lexer->lexsize -= 2;
2131
lexer->txtend = lexer->lexsize;
2133
CondReturnTextNode(doc, 4)
2135
lexer->txtstart = lexer->lexsize;
2139
ReportError(doc, NULL, NULL, MALFORMED_COMMENT );
2141
else if (c == 'd' || c == 'D')
2143
/* todo: check for complete "<!DOCTYPE" not just <!D */
2147
lexer->state = LEX_DOCTYPE; /* doctype */
2148
lexer->lexsize -= 2;
2149
lexer->txtend = lexer->lexsize;
2150
mode = IgnoreWhitespace;
2152
/* skip until white space or '>' */
2156
c = ReadChar(doc->docIn);
2159
if (c == EndOfStream || c == '>')
2161
UngetChar(c, doc->docIn);
2169
/* and skip to end of whitespace */
2173
c = ReadChar(doc->docIn);
2176
if (c == EndOfStream || c == '>')
2178
UngetChar(c, doc->docIn);
2186
UngetChar(c, doc->docIn);
2193
CondReturnTextNode(doc, (skip + 3))
2195
lexer->txtstart = lexer->lexsize;
2200
/* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
2201
lexer->lexsize -= 2;
2202
lexer->state = LEX_SECTION;
2203
lexer->txtend = lexer->lexsize;
2205
CondReturnTextNode(doc, 2)
2207
lexer->txtstart = lexer->lexsize;
2213
/* else swallow characters up to and including next '>' */
2214
while ((c = ReadChar(doc->docIn)) != '>')
2216
if (c == EndOfStream)
2218
UngetChar(c, doc->docIn);
2223
lexer->lexsize -= 2;
2224
lexer->lexbuf[lexer->lexsize] = '\0';
2225
lexer->state = LEX_CONTENT;
2230
processing instructions
2235
lexer->lexsize -= 2;
2236
lexer->state = LEX_PROCINSTR;
2237
lexer->txtend = lexer->lexsize;
2239
CondReturnTextNode(doc, 2)
2241
lexer->txtstart = lexer->lexsize;
2245
/* Microsoft ASP's e.g. <% ... server-code ... %> */
2248
lexer->lexsize -= 2;
2249
lexer->state = LEX_ASP;
2250
lexer->txtend = lexer->lexsize;
2252
CondReturnTextNode(doc, 2)
2254
lexer->txtstart = lexer->lexsize;
2258
/* Netscapes JSTE e.g. <# ... server-code ... #> */
2261
lexer->lexsize -= 2;
2262
lexer->state = LEX_JSTE;
2263
lexer->txtend = lexer->lexsize;
2265
CondReturnTextNode(doc, 2)
2267
lexer->txtstart = lexer->lexsize;
2271
/* check for start tag */
2274
UngetChar(c, doc->docIn); /* push back letter */
2275
UngetChar('<', doc->docIn);
2276
--(doc->docIn->curcol);
2277
lexer->lexsize -= 2; /* discard "<" + letter */
2278
lexer->txtend = lexer->lexsize;
2279
lexer->state = LEX_STARTTAG; /* ready to read tag name */
2281
CondReturnTextNode(doc, 2)
2283
/* lexer->txtstart = lexer->lexsize; missing here? */
2284
continue; /* no text so keep going */
2287
/* fix for bug 762102 */
2290
UngetChar(c, doc->docIn);
2294
/* otherwise treat as CDATA */
2295
lexer->state = LEX_CONTENT;
2296
lexer->waswhite = no;
2299
case LEX_ENDTAG: /* </letter */
2300
lexer->txtstart = lexer->lexsize - 1;
2301
doc->docIn->curcol += 2;
2302
c = ParseTagName( doc );
2303
lexer->token = TagToken( doc, EndTag ); /* create endtag token */
2304
lexer->lexsize = lexer->txtend = lexer->txtstart;
2307
while ( c != '>' && c != EndOfStream )
2309
c = ReadChar(doc->docIn);
2312
if (c == EndOfStream)
2314
FreeNode( doc, lexer->token );
2318
lexer->state = LEX_CONTENT;
2319
lexer->waswhite = no;
2320
#ifdef TIDY_STORE_ORIGINAL_TEXT
2321
StoreOriginalTextInToken(doc, lexer->token, 0); /* hmm... */
2323
return lexer->token; /* the endtag token */
2325
case LEX_STARTTAG: /* first letter of tagname */
2326
c = ReadChar(doc->docIn);
2327
ChangeChar(lexer, (tmbchar)c);
2328
lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */
2329
c = ParseTagName( doc );
2332
lexer->token = TagToken( doc, (isempty ? StartEndTag : StartTag) );
2334
/* parse attributes, consuming closing ">" */
2338
UngetChar(c, doc->docIn);
2340
attributes = ParseAttrs( doc, &isempty );
2344
lexer->token->type = StartEndTag;
2346
lexer->token->attributes = attributes;
2347
lexer->lexsize = lexer->txtend = lexer->txtstart;
2349
/* swallow newline following start tag */
2350
/* special check needed for CRLF sequence */
2351
/* this doesn't apply to empty elements */
2352
/* nor to preformatted content that needs escaping */
2354
if ((mode != Preformatted && ExpectsContent(lexer->token))
2355
|| nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
2357
c = ReadChar(doc->docIn);
2359
if (c != '\n' && c != '\f')
2360
UngetChar(c, doc->docIn);
2362
lexer->waswhite = yes; /* to swallow leading whitespace */
2365
lexer->waswhite = no;
2367
lexer->state = LEX_CONTENT;
2368
if (lexer->token->tag == NULL)
2369
ReportFatal( doc, NULL, lexer->token, UNKNOWN_ELEMENT );
2370
else if ( !cfgBool(doc, TidyXmlTags) )
2372
Node* curr = lexer->token;
2373
ConstrainVersion( doc, curr->tag->versions );
2375
if ( curr->tag->versions & VERS_PROPRIETARY )
2377
if ( !cfgBool(doc, TidyMakeClean) ||
2378
( !nodeIsNOBR(curr) && !nodeIsWBR(curr) ) )
2380
ReportError(doc, NULL, curr, PROPRIETARY_ELEMENT );
2382
if ( nodeIsLAYER(curr) )
2383
doc->badLayout |= USING_LAYER;
2384
else if ( nodeIsSPACER(curr) )
2385
doc->badLayout |= USING_SPACER;
2386
else if ( nodeIsNOBR(curr) )
2387
doc->badLayout |= USING_NOBR;
2391
RepairDuplicateAttributes( doc, curr );
2393
#ifdef TIDY_STORE_ORIGINAL_TEXT
2394
StoreOriginalTextInToken(doc, lexer->token, 0);
2396
return lexer->token; /* return start tag */
2398
case LEX_COMMENT: /* seen <!-- so look for --> */
2403
c = ReadChar(doc->docIn);
2404
AddCharToLexer(lexer, c);
2410
c = ReadChar(doc->docIn);
2415
ReportError(doc, NULL, NULL, MALFORMED_COMMENT );
2417
/* do not store closing -- in lexbuf */
2418
lexer->lexsize -= 2;
2419
lexer->txtend = lexer->lexsize;
2420
lexer->lexbuf[lexer->lexsize] = '\0';
2421
lexer->state = LEX_CONTENT;
2422
lexer->waswhite = no;
2423
lexer->token = CommentToken(doc);
2425
/* now look for a line break */
2427
c = ReadChar(doc->docIn);
2430
lexer->token->linebreak = yes;
2432
UngetChar(c, doc->docIn);
2434
return lexer->token;
2437
/* note position of first such error in the comment */
2440
lexer->lines = doc->docIn->curline;
2441
lexer->columns = doc->docIn->curcol - 3;
2446
if ( cfgBool(doc, TidyFixComments) )
2447
lexer->lexbuf[lexer->lexsize - 2] = '=';
2449
AddCharToLexer(lexer, c);
2451
/* if '-' then look for '>' to end the comment */
2455
/* otherwise continue to look for --> */
2456
lexer->lexbuf[lexer->lexsize - 2] = '=';
2459
case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
2461
/* use ParseDocTypeDecl() to tokenize doctype declaration */
2462
UngetChar(c, doc->docIn);
2463
lexer->lexsize -= 1;
2464
lexer->token = ParseDocTypeDecl(doc);
2466
lexer->txtend = lexer->lexsize;
2467
lexer->lexbuf[lexer->lexsize] = '\0';
2468
lexer->state = LEX_CONTENT;
2469
lexer->waswhite = no;
2471
/* make a note of the version named by the 1st doctype */
2472
if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))
2473
lexer->doctype = FindGivenVersion(doc, lexer->token);
2474
return lexer->token;
2476
case LEX_PROCINSTR: /* seen <? so look for '>' */
2477
/* check for PHP preprocessor instructions <?php ... ?> */
2479
if (lexer->lexsize - lexer->txtstart == 3)
2481
if (tmbstrncmp(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)
2483
lexer->state = LEX_PHP;
2488
if (lexer->lexsize - lexer->txtstart == 4)
2490
if (tmbstrncmp(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 &&
2491
IsWhite(lexer->lexbuf[lexer->txtstart + 3]))
2493
lexer->state = LEX_XMLDECL;
2499
if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */
2504
/* now look for '>' */
2505
c = ReadChar(doc->docIn);
2507
if (c == EndOfStream)
2509
ReportError(doc, NULL, NULL, UNEXPECTED_END_OF_FILE );
2510
UngetChar(c, doc->docIn);
2514
AddCharToLexer(lexer, c);
2521
lexer->lexsize -= 1;
2528
for (i = 0; i < lexer->lexsize - lexer->txtstart &&
2529
!IsWhite(lexer->lexbuf[i + lexer->txtstart]); ++i)
2532
closed = lexer->lexbuf[lexer->lexsize - 1] == '?';
2535
lexer->lexsize -= 1;
2537
lexer->txtstart += i;
2538
lexer->txtend = lexer->lexsize;
2539
lexer->lexbuf[lexer->lexsize] = '\0';
2541
lexer->token = PIToken(doc);
2542
lexer->token->closed = closed;
2543
lexer->token->element = tmbstrndup(lexer->lexbuf +
2544
lexer->txtstart - i, i);
2548
lexer->txtend = lexer->lexsize;
2549
lexer->lexbuf[lexer->lexsize] = '\0';
2550
lexer->token = PIToken(doc);
2553
lexer->state = LEX_CONTENT;
2554
lexer->waswhite = no;
2555
return lexer->token;
2557
case LEX_ASP: /* seen <% so look for "%>" */
2561
/* now look for '>' */
2562
c = ReadChar(doc->docIn);
2567
UngetChar(c, doc->docIn);
2571
lexer->lexsize -= 1;
2572
lexer->txtend = lexer->lexsize;
2573
lexer->lexbuf[lexer->lexsize] = '\0';
2574
lexer->state = LEX_CONTENT;
2575
lexer->waswhite = no;
2576
return lexer->token = AspToken(doc);
2578
case LEX_JSTE: /* seen <# so look for "#>" */
2582
/* now look for '>' */
2583
c = ReadChar(doc->docIn);
2588
UngetChar(c, doc->docIn);
2592
lexer->lexsize -= 1;
2593
lexer->txtend = lexer->lexsize;
2594
lexer->lexbuf[lexer->lexsize] = '\0';
2595
lexer->state = LEX_CONTENT;
2596
lexer->waswhite = no;
2597
return lexer->token = JsteToken(doc);
2599
case LEX_PHP: /* seen "<?php" so look for "?>" */
2603
/* now look for '>' */
2604
c = ReadChar(doc->docIn);
2608
UngetChar(c, doc->docIn);
2612
lexer->lexsize -= 1;
2613
lexer->txtend = lexer->lexsize;
2614
lexer->lexbuf[lexer->lexsize] = '\0';
2615
lexer->state = LEX_CONTENT;
2616
lexer->waswhite = no;
2617
return lexer->token = PhpToken(doc);
2619
case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */
2621
if (IsWhite(c) && c != '?')
2624
/* get pseudo-attribute */
2633
UngetChar(c, doc->docIn);
2635
name = ParseAttribute( doc, &isempty, &asp, &php );
2639
/* fix for http://tidy.sf.net/bug/788031 */
2640
lexer->lexsize -= 1;
2641
lexer->txtend = lexer->txtstart;
2642
lexer->lexbuf[lexer->txtend] = '\0';
2643
lexer->state = LEX_CONTENT;
2644
lexer->waswhite = no;
2645
lexer->token = XmlDeclToken(doc);
2646
lexer->token->attributes = attributes;
2647
return lexer->token;
2650
av = NewAttribute();
2651
av->attribute = name;
2652
av->value = ParseValue( doc, name, yes, &isempty, &pdelim );
2654
av->dict = FindAttribute( doc, av );
2656
AddAttrToList( &attributes, av );
2660
/* now look for '>' */
2661
c = ReadChar(doc->docIn);
2665
UngetChar(c, doc->docIn);
2668
lexer->lexsize -= 1;
2669
lexer->txtend = lexer->txtstart;
2670
lexer->lexbuf[lexer->txtend] = '\0';
2671
lexer->state = LEX_CONTENT;
2672
lexer->waswhite = no;
2673
lexer->token = XmlDeclToken(doc);
2674
lexer->token->attributes = attributes;
2675
return lexer->token;
2677
case LEX_SECTION: /* seen "<![" so look for "]>" */
2680
if (lexer->lexsize == (lexer->txtstart + 6) &&
2681
tmbstrncmp(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)
2683
lexer->state = LEX_CDATA;
2684
lexer->lexsize -= 6;
2692
/* now look for '>' */
2693
c = ReadChar(doc->docIn);
2697
UngetChar(c, doc->docIn);
2701
lexer->lexsize -= 1;
2702
lexer->txtend = lexer->lexsize;
2703
lexer->lexbuf[lexer->lexsize] = '\0';
2704
lexer->state = LEX_CONTENT;
2705
lexer->waswhite = no;
2706
return lexer->token = SectionToken(doc);
2708
case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
2712
/* now look for ']' */
2713
c = ReadChar(doc->docIn);
2717
UngetChar(c, doc->docIn);
2721
/* now look for '>' */
2722
c = ReadChar(doc->docIn);
2726
UngetChar(c, doc->docIn);
2730
lexer->lexsize -= 1;
2731
lexer->txtend = lexer->lexsize;
2732
lexer->lexbuf[lexer->lexsize] = '\0';
2733
lexer->state = LEX_CONTENT;
2734
lexer->waswhite = no;
2735
return lexer->token = CDATAToken(doc);
2739
if (lexer->state == LEX_CONTENT) /* text string */
2741
lexer->txtend = lexer->lexsize;
2743
if (lexer->txtend > lexer->txtstart)
2745
UngetChar(c, doc->docIn);
2747
if (lexer->lexbuf[lexer->lexsize - 1] == ' ')
2749
lexer->lexsize -= 1;
2750
lexer->txtend = lexer->lexsize;
2752
lexer->token = TextToken(lexer);
2753
#ifdef TIDY_STORE_ORIGINAL_TEXT
2754
StoreOriginalTextInToken(doc, lexer->token, 0); /* ? */
2756
return lexer->token;
2759
else if (lexer->state == LEX_COMMENT) /* comment */
2761
if (c == EndOfStream)
2762
ReportError(doc, NULL, NULL, MALFORMED_COMMENT );
2764
lexer->txtend = lexer->lexsize;
2765
lexer->lexbuf[lexer->lexsize] = '\0';
2766
lexer->state = LEX_CONTENT;
2767
lexer->waswhite = no;
2768
return lexer->token = CommentToken(doc);
2774
/*
** MapStr: walks the bytes of str; each byte value is taken as an index
** (read through a cast to byte so it is never negative).
** NOTE(review): the loop header and the statement that applies code to
** the indexed table entry are not visible in this listing; presumably
** the entry of lexmap is combined with code - confirm upstream.
**
** The calls that follow classify characters: line break characters as
** newline and white, space and tab as white, four punctuation marks as
** namechar, digits as digit and namechar, and ASCII letters as letter
** and namechar together with lowercase or uppercase.
** NOTE(review): the header of the function enclosing these calls is not
** visible in this listing.
*/
static void MapStr( ctmbstr str, uint code )
2778
uint i = (byte) *str++;
2785
MapStr("\r\n\f", newline|white);
2786
MapStr(" \t", white);
2787
MapStr("-.:_", namechar);
2788
MapStr("0123456789", digit|namechar);
2789
MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);
2790
MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);
2794
/*
** ParseAsp: collects an ASP fragment that appears inside a start tag.
** The text start is set to the current end of the lexer buffer and
** characters are appended from the input; either read may hit end of
** stream.  Two characters are then dropped from the buffer (presumably
** the closing percent sign and angle bracket - confirm), and when the
** remaining range is not empty an ASP token is built with AspToken().
** txtstart is finally moved up to txtend.
**
** NOTE(review): the declarations of c and asp, the loop and its exit
** tests, the return, the delimiters of the residual description below
** and all braces are not visible in this listing - confirm upstream.
*/
parser for ASP within start tags
2796
Some people use ASP for to customize attributes
2797
Tidy isn't really well suited to dealing with ASP
2798
This is a workaround for attributes, but won't
2799
deal with the case where the ASP is used to tailor
2800
the attribute value. Here is an example of a work
2801
around for using ASP in attribute values:
2803
href='<%=rsSchool.Fields("ID").Value%>'
2805
where the ASP that generates the attribute value
2806
is masked from Tidy by the quotemarks.
2810
static Node *ParseAsp( TidyDocImpl* doc )
2812
Lexer* lexer = doc->lexer;
2816
lexer->txtstart = lexer->lexsize;
2820
if ((c = ReadChar(doc->docIn)) == EndOfStream)
2823
AddCharToLexer(lexer, c);
2829
if ((c = ReadChar(doc->docIn)) == EndOfStream)
2832
AddCharToLexer(lexer, c);
2836
lexer->lexsize -= 2;
2841
lexer->txtend = lexer->lexsize;
2842
if (lexer->txtend > lexer->txtstart)
2843
asp = AspToken(doc);
2845
lexer->txtstart = lexer->txtend;
2851
/*
** ParsePhp: collects a PHP fragment that appears inside a start tag.
** It follows the same visible sequence as ParseAsp(): txtstart is set to
** the current end of the lexer buffer, characters are appended from the
** input (either read may hit end of stream), two characters are dropped
** from the buffer (presumably the closing question mark and angle
** bracket - confirm), and a non-empty range becomes a token through
** PhpToken().  txtstart is finally moved up to txtend.
**
** NOTE(review): the declarations of c and php, the loop and its exit
** tests, the return, the delimiters of the residual description below
** and all braces are not visible in this listing - confirm upstream.
*/
PHP is like ASP but is based upon XML
2852
processing instructions, e.g. <?php ... ?>
2854
static Node *ParsePhp( TidyDocImpl* doc )
2856
Lexer* lexer = doc->lexer;
2860
/* the fragment starts at the current end of the buffer */
lexer->txtstart = lexer->lexsize;
2864
if ((c = ReadChar(doc->docIn)) == EndOfStream)
2867
AddCharToLexer(lexer, c);
2873
if ((c = ReadChar(doc->docIn)) == EndOfStream)
2876
AddCharToLexer(lexer, c);
2880
lexer->lexsize -= 2;
2885
lexer->txtend = lexer->lexsize;
2886
/* only a non-empty fragment becomes a token */
if (lexer->txtend > lexer->txtstart)
2887
php = PhpToken(doc);
2889
lexer->txtstart = lexer->txtend;
2893
/*
** ParseAttribute: reads the next attribute name inside a start tag and
** returns it as a newly duplicated string, or NULL when no name
** characters were collected.
**
** Out parameters asp and php are cleared on entry and receive the
** result of ParseAsp() or ParsePhp() when embedded server code is met.
** The visible error reports are UNEXPECTED_GT, UNEXPECTED_EQUALSIGN,
** UNEXPECTED_QUOTEMARK and UNEXPECTED_END_OF_FILE_ATTR.  While the name
** is collected, an equals sign, a closing or opening angle bracket, end
** of stream, or a quote mark right after a hyphen is pushed back and
** ends the name.  Upper case characters are tested for when TidyXmlTags
** is off (the folding statement itself is not visible).  The collected
** bytes are copied out of the lexer buffer and lexsize is rewound to
** where the name began.
**
** NOTE(review): how isempty is written, the declarations of c, lastc,
** start, len and attr, the loops, most conditions, the return and all
** braces are not visible in this listing - confirm against upstream.
*/
/* consumes the '>' terminating start tags */
2894
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool *isempty,
2895
Node **asp, Node **php)
2897
Lexer* lexer = doc->lexer;
2902
*asp = NULL; /* clear asp pointer */
2903
*php = NULL; /* clear php pointer */
2905
/* skip white space before the attribute */
2909
c = ReadChar( doc->docIn );
2914
c = ReadChar( doc->docIn );
2922
UngetChar(c, doc->docIn);
2932
c = ReadChar(doc->docIn);
2936
*asp = ParseAsp( doc );
2941
*php = ParsePhp( doc );
2945
UngetChar(c, doc->docIn);
2946
UngetChar('<', doc->docIn);
2947
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT );
2953
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );
2957
if (c == '"' || c == '\'')
2959
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
2963
if (c == EndOfStream)
2965
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
2966
UngetChar(c, doc->docIn);
2975
start = lexer->lexsize;
2980
/* but push back '=' for parseValue() */
2981
if (c == '=' || c == '>')
2983
UngetChar(c, doc->docIn);
2987
if (c == '<' || c == EndOfStream)
2989
UngetChar(c, doc->docIn);
2993
if (lastc == '-' && (c == '"' || c == '\''))
2997
UngetChar(c, doc->docIn);
3004
/* what should be done about non-namechar characters? */
3005
/* currently these are incorporated into the attr name */
3007
if ( !cfgBool(doc, TidyXmlTags) && IsUpper(c) )
3010
AddCharToLexer( lexer, c );
3012
c = ReadChar(doc->docIn);
3015
/* handle attribute names with multibyte chars */
3016
len = lexer->lexsize - start;
3017
attr = (len > 0 ? tmbstrndup(lexer->lexbuf+start, len) : NULL);
3018
lexer->lexsize = start;
3023
/*
** ParseServerInstruction: collects an attribute value that begins with
** an opening angle bracket into the lexer buffer.
**
** The character after the bracket is read and stored.  A percent sign,
** question mark or at sign marks the value as ASP, PHP or Tango (as the
** original comment puts it); the residual description below says that
** other values also end at whitespace.  Two further read sequences are
** visible, each reporting UNEXPECTED_END_OF_FILE_ATTR at end of stream
** and UNEXPECTED_GT at a closing angle bracket after pushing the
** character back; per the residual description these presumably handle
** the two kinds of quoted string - confirm.
**
** NOTE(review): the meaning of the int result, the declarations, loops,
** most conditions, the return, the delimiters of the residual
** description and all braces are not visible in this listing - confirm
** against upstream.
*/
invoked when < is seen in place of attribute value
3024
but terminates on whitespace if not ASP, PHP or Tango
3025
this routine recognizes ' and " quoted strings
3027
static int ParseServerInstruction( TidyDocImpl* doc )
3029
Lexer* lexer = doc->lexer;
3034
c = ReadChar(doc->docIn);
3035
AddCharToLexer(lexer, c);
3037
/* check for ASP, PHP or Tango */
3038
if (c == '%' || c == '?' || c == '@')
3043
c = ReadChar(doc->docIn);
3045
if (c == EndOfStream)
3051
AddCharToLexer(lexer, c);
3053
UngetChar(c, doc->docIn);
3058
/* if not recognized as ASP, PHP or Tango */
3059
/* then also finish value on whitespace */
3066
AddCharToLexer(lexer, c);
3072
c = ReadChar(doc->docIn);
3073
if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3075
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3076
UngetChar(c, doc->docIn);
3079
if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3081
UngetChar(c, doc->docIn);
3082
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT );
3085
AddCharToLexer(lexer, c);
3096
c = ReadChar(doc->docIn);
3097
if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3099
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3100
UngetChar(c, doc->docIn);
3103
if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3105
UngetChar(c, doc->docIn);
3106
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT );
3109
AddCharToLexer(lexer, c);
3118
/* values start with "=" or " = " etc. */
3119
/* doesn't consume the ">" at end of start tag */
3121
/*
 ParseValue: reads the value part of an attribute from doc->docIn.

 doc      - document whose input stream and lexer are used
 name     - attribute name; consulted through IsUrl(), IsScript() and the
            alt/title/value/prompt comparison near the end
 foldCase - tested together with IsUpper(c) while reading the value (the
            body of that branch is not visible in this excerpt)
 isempty  - supplied by the caller; its use is not visible in this excerpt
 pdelim   - receives the delimiter: the result of ParseServerInstruction()
            on that path, otherwise delim, or '"' when delim is 0

 The value text is accumulated in the lexer buffer starting at `start`;
 once its length is known lexer->lexsize is reset to `start`, so the text
 is copied out with tmbstrndup() rather than left in the buffer.

 The server-instruction path returns the copy, or NULL when nothing was
 collected. On the normal path the copy is stored in `value`, which is
 presumably what is returned (the return statement is not visible here).
*/
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name,
3122
Bool foldCase, Bool *isempty, int *pdelim)
3124
Lexer* lexer = doc->lexer;
3128
uint c, lastc, delim, quotewarning;
3131
delim = (tmbchar) 0;
3135
Henry Zrepa reports that some folk are using the
3136
embed element with script attributes where newlines
3137
are significant and must be preserved
3139
if ( cfgBool(doc, TidyLiteralAttribs) )
3142
/* skip white space before the '=' */
3146
c = ReadChar(doc->docIn);
3148
if (c == EndOfStream)
3150
UngetChar(c, doc->docIn);
3159
c should be '=' if there is a value
3160
other legal possibilities are white
3164
if (c != '=' && c != '"' && c != '\'')
3166
UngetChar(c, doc->docIn);
3170
/* skip white space after '=' */
3174
c = ReadChar(doc->docIn);
3176
if (c == EndOfStream)
3178
UngetChar(c, doc->docIn);
3186
/* check for quote marks */
3188
if (c == '"' || c == '\'')
/*
 Server-instruction path (its guarding condition is not visible here):
 remember where the value begins, let ParseServerInstruction() append the
 text, then copy it out and rewind lexsize to where it was.
*/
3192
start = lexer->lexsize;
3193
AddCharToLexer(lexer, c);
3194
*pdelim = ParseServerInstruction( doc );
3195
len = lexer->lexsize - start;
3196
lexer->lexsize = start;
3197
return (len > 0 ? tmbstrndup(lexer->lexbuf+start, len) : NULL);
3200
UngetChar(c, doc->docIn);
3203
and read the value string
3204
check for quote mark if needed
3208
start = lexer->lexsize;
3213
lastc = c; /* track last character */
3214
c = ReadChar(doc->docIn);
3216
if (c == EndOfStream)
3218
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3219
UngetChar(c, doc->docIn);
3223
/* delim == 0 means the value is not enclosed in quote marks */
if (delim == (tmbchar)0)
3227
UngetChar(c, doc->docIn);
3231
if (c == '"' || c == '\'')
3235
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3237
/* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */
3238
/* this doesn't handle <a title=foo"/> which browsers treat as */
3239
/* 'foo"/' nor <a title=foo" /> which browsers treat as 'foo"' */
3241
c = ReadChar(doc->docIn);
3244
AddCharToLexer(lexer, q);
3245
UngetChar(c, doc->docIn);
3250
UngetChar(c, doc->docIn);
3257
UngetChar(c, doc->docIn);
3259
UngetChar(c, doc->docIn);
3260
ReportAttrError( doc, lexer->token, NULL, UNEXPECTED_GT );
3265
For cases like <br clear=all/> need to avoid treating /> as
3266
part of the attribute value, however care is needed to avoid
3267
so treating <a href=http://www.acme.com/> in this way, which
3268
would map the <a> tag to <a href="http://www.acme.com"/>
3272
/* peek ahead in case of /> */
3273
c = ReadChar(doc->docIn);
3275
if ( c == '>' && !IsUrl(doc, name) )
3278
UngetChar(c, doc->docIn);
3282
/* unget peeked character */
3283
UngetChar(c, doc->docIn);
3287
else /* delim is '\'' or '"' */
3292
if (c == '\n' || c == '<' || c == '>')
3301
AddCharToLexer(lexer, c);
3302
ParseEntity( doc, 0 );
3303
/* with munge set, a newline left at the end of the buffer becomes a space */
if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge)
3304
ChangeChar(lexer, ' ');
3309
kludge for JavaScript attribute values
3310
with line continuations in string literals
3314
c = ReadChar(doc->docIn);
3318
UngetChar(c, doc->docIn);
3330
/* discard line breaks in quoted URLs */
3331
/* #438650 - fix by Randy Waki */
3332
if ( c == '\n' && IsUrl(doc, name) )
3334
/* warn that we discard this newline */
3335
ReportAttrError( doc, lexer->token, NULL, NEWLINE_IN_URI);
3345
else if (foldCase && IsUpper(c))
3348
AddCharToLexer(lexer, c);
3351
if (quotewarning > 10 && seen_gt && munge)
3354
there is almost certainly a missing trailing quote mark
3355
as we have see too many newlines, < or > characters.
3357
an exception is made for Javascript attributes and the
3358
javascript URL scheme which may legitimately include < and >,
3359
and for attributes starting with "<xml " as generated by
3362
if ( !IsScript(doc, name) &&
3363
!(IsUrl(doc, name) && tmbstrncmp(lexer->lexbuf+start, "javascript:", 11) == 0) &&
3364
!(tmbstrncmp(lexer->lexbuf+start, "<xml ", 5) == 0)
3366
ReportFatal( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE );
3369
/* length of the collected value; lexsize is rewound to where it started */
len = lexer->lexsize - start;
3370
lexer->lexsize = start;
3373
if (len > 0 || delim)
3375
/* ignore leading and trailing white space for all but title, alt, value */
3376
/* and prompts attributes unless --literal-attributes is set to yes */
3377
/* #994841 - Whitespace is removed from value attributes */
3380
tmbstrcasecmp(name, "alt") &&
3381
tmbstrcasecmp(name, "title") &&
3382
tmbstrcasecmp(name, "value") &&
3383
tmbstrcasecmp(name, "prompt"))
3385
/*
 NOTE(review): this condition has no lower bound on len, and the enclosing
 test above also admits len == 0 when delim is set - confirm that
 lexbuf[start+len-1] cannot be read before the start of the value.
*/
while (IsWhite(lexer->lexbuf[start+len-1]))
3388
/*
 NOTE(review): `start < len` compares a buffer offset with a length, and it
 is evaluated after lexbuf[start] has already been read - confirm this is
 the intended bound (the loop body is not visible in this excerpt).
*/
while (IsWhite(lexer->lexbuf[start]) && start < len)
3395
value = tmbstrndup(lexer->lexbuf + start, len);
3400
/* note delimiter if given */
/* when no quote mark was seen (delim == 0) a double quote is reported */
3401
*pdelim = (delim ? delim : '"');
3406
/*
 IsValidAttrName: checks an attribute name character by character.

 attr must be non-NULL: attr[0] is read unconditionally in the
 declaration below.

 Returns a Bool; the tests applied to each character and the return
 statements are not visible in this excerpt.
*/
3407
Bool IsValidAttrName( ctmbstr attr )
3409
uint i, c = attr[0];
3411
/* first character should be a letter */
3415
/* remaining characters should be namechars */
/* note: tmbstrlen(attr) sits in the loop condition, so it is evaluated on every pass */
3416
for( i = 1; i < tmbstrlen(attr); i++)
3429
/*
 create a new attribute

 Allocates storage for one AttVal with MemAlloc() and passes it to
 ClearMemory() with the same size, so the caller starts from a cleared
 record. The return statement is not visible in this excerpt.

 NOTE(review): the MemAlloc() result goes straight into ClearMemory()
 without a NULL test; presumably MemAlloc() deals with allocation
 failure itself - confirm.
*/
3430
AttVal *NewAttribute(void)
3432
AttVal *av = (AttVal*) MemAlloc( sizeof(AttVal) );
3433
ClearMemory( av, sizeof(AttVal) );
3437
/*
 create a new attribute with given name and value

 doc   - document passed on to FindAttribute()
 name  - attribute name; stored as a tmbstrdup() copy in av->attribute
 value - attribute value; stored as a tmbstrdup() copy in av->value

 The rest of the parameter list and the return statement are not visible
 in this excerpt.
*/
3438
AttVal* NewAttributeEx( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
3441
AttVal *av = NewAttribute();
3442
av->attribute = tmbstrdup(name);
3443
av->value = tmbstrdup(value);
3445
/* the dictionary lookup runs after av->attribute has been set */
av->dict = FindAttribute( doc, av );
3449
/*
 AddAttrToList: adds av to the singly linked attribute list whose head
 pointer is *list.

 list - address of the list head; *list == NULL is tested as the
        empty-list case
 av   - attribute to add

 For a non-empty list the code walks from *list along ->next until it
 reaches a node with no successor. The statements that actually store av
 are not visible in this excerpt.
*/
static void AddAttrToList( AttVal** list, AttVal* av )
3451
if ( *list == NULL )
3455
AttVal* here = *list;
3456
while ( here->next )
3462
/*
 InsertAttributeAtEnd: hands av to AddAttrToList() together with the
 address of node->attributes, i.e. the node's own attribute list head.
*/
void InsertAttributeAtEnd( Node *node, AttVal *av )
3464
AddAttrToList(&node->attributes, av);
3467
/*
 InsertAttributeAtStart: makes av the first entry of node's attribute
 list. av->next is overwritten with the old head, so whatever av->next
 held before the call is lost.
*/
void InsertAttributeAtStart( Node *node, AttVal *av )
3469
av->next = node->attributes;
3470
node->attributes = av;
3473
/*
 ParseAttrs: collects the attributes of the tag currently being lexed
 into a list of AttVal records.

 swallows closing '>'

 doc     - document whose lexer and input are used
 isempty - passed through to ParseAttribute() and ParseValue()

 Loops until EndOfInput(doc). Each pass obtains a name from
 ParseAttribute(); when a name is returned, ParseValue() reads the value
 and the pair is either appended to the list or reported as an error.
 The return statement is not visible in this excerpt.
*/
3475
static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )
3477
Lexer* lexer = doc->lexer;
3485
while ( !EndOfInput(doc) )
3487
tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );
3489
/* no name: ParseAttribute() may instead have filled in asp or php */
if (attribute == NULL)
3491
/* check if attributes are created by ASP markup */
3494
av = NewAttribute();
3496
AddAttrToList( &list, av );
3500
/* check if attributes are created by PHP markup */
3503
av = NewAttribute();
3505
AddAttrToList( &list, av );
3512
/* foldCase is passed as `no` here */
value = ParseValue( doc, attribute, no, isempty, &delim );
3514
/* accept HTML-style names, or XML-style names when TidyXmlTags is set */
if (attribute && (IsValidAttrName(attribute) ||
3515
(cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))
3517
av = NewAttribute();
3519
/* the name string is handed over to av as a pointer, not copied */
av->attribute = attribute;
3521
av->dict = FindAttribute( doc, av );
3522
AddAttrToList( &list, av );
3526
/*
 Rejected name: a temporary AttVal is built only so the error report can
 refer to it; it is released with FreeAttribute() afterwards and is not
 added to the list in the visible lines.
*/
av = NewAttribute();
3527
av->attribute = attribute;
3530
if (LastChar(attribute) == '"')
3531
ReportAttrError( doc, lexer->token, av, MISSING_QUOTEMARK);
3532
else if (value == NULL)
3533
ReportAttrError(doc, lexer->token, av, MISSING_ATTR_VALUE);
3535
ReportAttrError(doc, lexer->token, av, INVALID_ATTRIBUTE);
3537
FreeAttribute( doc, av );
3545
Returns document type declarations like
3547
<!DOCTYPE foo PUBLIC "fpi" "sysid">
3548
<!DOCTYPE bar SYSTEM "sysid">
3549
<!DOCTYPE baz [ <!ENTITY ouml "&#246"> ]>
3553
<foo PUBLIC="fpi" SYSTEM="sysid" />
3554
<bar SYSTEM="sysid" />
3555
<baz> <!ENTITY ouml "&#246"> </baz>
3557
/*
 ParseDocTypeDecl: reads a document type declaration from doc->docIn and
 builds a Node of type DocTypeTag for it.

 doc - document whose input stream and lexer are used.

 Characters are appended to the lexer buffer while a state variable moves
 between DT_DOCTYPENAME, DT_INTERMEDIATE, DT_PUBLICSYSTEM, DT_QUOTEDSTRING
 and DT_INTSUBSET. Outside DT_INTSUBSET newlines are turned into spaces
 and runs of white space are collapsed to a single character.

 MALFORMED_DOCTYPE is reported, and the node freed, when the name is
 missing or not a valid XML element name, and again in the code after the
 read loop. The return statements are not visible in this excerpt.
*/
static Node *ParseDocTypeDecl(TidyDocImpl* doc)
3559
Lexer *lexer = doc->lexer;
3560
/* start marks where the piece currently being read begins in lexbuf */
int start = lexer->lexsize;
3561
ParseDocTypeDeclState state = DT_DOCTYPENAME;
3566
Node* node = NewNode(lexer);
3567
node->type = DocTypeTag;
3568
node->start = lexer->txtstart;
3569
node->end = lexer->txtend;
3571
lexer->waswhite = no;
3573
/* todo: reset lexer->lexsize when appropriate to avoid wasting memory */
3575
while ((c = ReadChar(doc->docIn)) != EndOfStream)
3577
/* convert newlines to spaces */
3578
if (state != DT_INTSUBSET)
3579
c = c == '\n' ? ' ' : c;
3581
/* convert white-space sequences to single space character */
3582
if (IsWhite(c) && state != DT_INTSUBSET)
3584
if (!lexer->waswhite)
3586
AddCharToLexer(lexer, c);
3587
lexer->waswhite = yes;
3597
AddCharToLexer(lexer, c);
3598
lexer->waswhite = no;
3603
case DT_INTERMEDIATE:
3604
/* determine what's next */
3605
if (ToUpper(c) == 'P' || ToUpper(c) == 'S')
3607
/* presumably c is already in the buffer, so the keyword starts one back */
start = lexer->lexsize - 1;
3608
state = DT_PUBLICSYSTEM;
3613
start = lexer->lexsize;
3614
state = DT_INTSUBSET;
3617
else if (c == '\'' || c == '"')
3619
start = lexer->lexsize;
3621
state = DT_QUOTEDSTRING;
3628
/* shrinks the lexer buffer by one character and records the new size as node->end */
node->end = --(lexer->lexsize);
3630
si = GetAttrByName(node, "SYSTEM");
3632
CheckUrl(doc, node, si);
3634
if (!node->element || !IsValidXMLElemName(node->element))
3636
ReportError(doc, NULL, NULL, MALFORMED_DOCTYPE);
3637
FreeNode(doc, node);
3640
#ifdef TIDY_STORE_ORIGINAL_TEXT
3641
StoreOriginalTextInToken(doc, node, 0);
3650
case DT_DOCTYPENAME:
3651
/* read document type name */
3652
if (IsWhite(c) || c == '>' || c == '[')
3654
/* the length leaves out the last character in the buffer (presumably the terminator just read) */
node->element = tmbstrndup(lexer->lexbuf + start,
3655
lexer->lexsize - start - 1);
3656
if (c == '>' || c == '[')
3659
UngetChar(c, doc->docIn);
3662
state = DT_INTERMEDIATE;
3666
case DT_PUBLICSYSTEM:
3667
/* read PUBLIC/SYSTEM */
3668
if (IsWhite(c) || c == '>')
3670
/* NOTE(review): the release of this temporary copy is not visible in this excerpt - confirm it is freed */
char *attname = tmbstrndup(lexer->lexbuf + start,
3671
lexer->lexsize - start - 1);
/* anything other than SYSTEM (compared case-insensitively) counts as having an FPI */
3672
hasfpi = !(tmbstrcasecmp(attname, "SYSTEM") == 0);
3676
/* todo: report an error if SYSTEM/PUBLIC not uppercase */
3681
UngetChar(c, doc->docIn);
3684
state = DT_INTERMEDIATE;
3688
case DT_QUOTEDSTRING:
3689
/* read quoted string */
3692
/* NOTE(review): the release of this temporary copy is not visible in this excerpt - confirm it is freed */
char *value = tmbstrndup(lexer->lexbuf + start,
3693
lexer->lexsize - start - 1);
/* the quoted string becomes the PUBLIC attribute when hasfpi is set, otherwise SYSTEM */
3694
AttVal* att = AddAttribute(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
3698
state = DT_INTERMEDIATE;
3704
/* read internal subset */
3708
/* the subset text becomes a text token that is appended as a child of node */
lexer->txtstart = start;
3709
lexer->txtend = lexer->lexsize - 1;
3710
subset = TextToken(lexer);
3711
InsertNodeAtEnd(node, subset);
3712
state = DT_INTERMEDIATE;
3718
/* document type declaration not finished */
3719
ReportError(doc, NULL, NULL, MALFORMED_DOCTYPE);
3720
FreeNode(doc, node);