2
tidy.c - HTML parser and pretty printer
4
Copyright (c) 1998-2001 World Wide Web Consortium
5
(Massachusetts Institute of Technology, Institut National de
6
Recherche en Informatique et en Automatique, Keio University).
11
$Author: terry_teague $
12
$Date: 2002/01/10 08:57:17 $
15
Contributing Author(s):
17
Dave Raggett <dsr@w3.org>
19
The contributing author(s) would like to thank all those who
20
helped with testing, bug fixes and suggestions for improvements.
21
This wouldn't have been possible without your help.
25
This software and documentation is provided "as is," and
26
the copyright holders and contributing author(s) make no
27
representations or warranties, express or implied, including
28
but not limited to, warranties of merchantability or fitness
29
for any particular purpose or that the use of the software or
30
documentation will not infringe any third party patents,
31
copyrights, trademarks or other rights.
33
The copyright holders and contributing author(s) will not be held
34
liable for any direct, indirect, special or consequential damages
35
arising out of any use of the software or documentation, even if
36
advised of the possibility of such damage.
38
Permission is hereby granted to use, copy, modify, and distribute
39
this source code, or portions hereof, documentation and executables,
40
for any purpose, without fee, subject to the following restrictions:
42
1. The origin of this source code must not be misrepresented.
43
2. Altered versions must be plainly marked as such and must
44
not be misrepresented as being the original source.
45
3. This Copyright notice may not be removed or altered from any
46
source or altered source distribution.
48
The copyright holders and contributing author(s) specifically
49
permit, without fee, and encourage the use of this source code
50
as a component for supporting the Hypertext Markup Language in
51
commercial products. If you use this source code in a product,
52
acknowledgment is not required but would be appreciated.
58
void DeInitTidy(void);
60
extern char *release_date;
63
Node *debug_element = null;
64
Lexer *debug_lexer = null;
66
uint totalwarnings = 0;
67
uint optionerrors = 0;
69
FILE *errout; /* set to stderr or stdout */
72
/* char encoding used when replacing illegal SGML chars, regardless of specified encoding */
73
int ReplacementCharEncoding = WIN1252; /* by default */
75
#define UNICODE_BOM_BE 0xFEFF /* this is the big-endian (default) UNICODE BOM */
76
#define UNICODE_BOM UNICODE_BOM_BE
77
#define UNICODE_BOM_LE 0xFFFE /* this is the little-endian UNICODE BOM */
78
#define UNICODE_BOM_UTF8 0xEFBBBF /* this is the UTF-8 UNICODE BOM */
81
Private unget buffer for the raw bytes read from the input stream.
82
Normally this will only be used by the UTF-8 decoder to resynchronize the
83
input stream after finding an illegal UTF-8 sequence.
84
But it can be used for other purposes when reading bytes in ReadCharFromStream.
87
static unsigned char rawBytebuf[CHARBUF_SIZE];
88
static int rawBufpos = 0;
89
static Bool rawPushed = no;
91
/* Mapping for Windows Western character set CP 1252 (chars 128-159/U+0080-U+009F) to Unicode */
92
uint Win2Unicode[32] =
94
0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
95
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
96
0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
97
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
100
/* Function for conversion from Windows-1252 to Unicode */
101
uint DecodeWin1252(uint c)
103
if (127 < c && c < 160)
104
c = Win2Unicode[c - 128];
110
John Love-Jensen contributed this table for mapping MacRoman
111
character set to Unicode
114
/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
115
uint Mac2Unicode[128] =
119
0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
120
0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
122
0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
123
0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
125
0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
126
0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
128
0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
129
/* =BD U+2126 OHM SIGN */
130
0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
132
0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
133
0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
135
0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
136
/* =DB U+00A4 CURRENCY SIGN */
137
0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
139
0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
140
0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
141
/* xF0 = Apple Logo */
142
/* =F0 U+2665 BLACK HEART SUIT */
143
0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
144
0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
147
/* Function to convert from MacRoman to Unicode */
148
uint DecodeMacRoman(uint c)
151
c = Mac2Unicode[c - 128];
157
Table to map symbol font characters to Unicode; undefined
158
characters are mapped to 0x0000 and characters without any
159
Unicode equivalent are mapped to '?'. Is this appropriate?
162
uint Symbol2Unicode[] =
164
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
165
0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
167
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
168
0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
170
0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
171
0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
173
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
174
0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
176
0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
177
0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
179
0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
180
0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
182
0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
183
0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
185
0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
186
0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,
188
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
189
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
191
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
192
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
194
0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
195
0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
197
0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
198
0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,
200
0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
201
0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
203
0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
204
0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
206
0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
207
0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
209
0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
210
0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
213
/* Function to convert from Symbol Font chars to Unicode */
214
uint DecodeSymbolFont(uint c)
219
/* todo: add some error message */
221
return Symbol2Unicode[c];
224
void FatalError(char *msg)
226
fprintf(stderr, "Fatal error: %s\n", msg);
229
if (input && input != stdin)
232
/* 2 signifies a serious error */
236
/*
 Allocate size bytes with malloc.
 On failure FatalError("Out of memory!") is called, so callers do not
 check the result themselves.
*/
void *MemAlloc(uint size)
{
    void *p = malloc(size);

    if (!p)
        FatalError("Out of memory!");

    return p;
}
248
/*
 Resize a block obtained from MemAlloc/MemRealloc to newsize bytes.
 A null mem behaves like MemAlloc(newsize). On failure
 FatalError("Out of memory!") is called.
*/
void *MemRealloc(void *mem, uint newsize)
{
    void *p;

    if (mem == (void *)null)
        return MemAlloc(newsize);

    p = realloc(mem, newsize);

    if (!p)
        FatalError("Out of memory!");

    return p;
}
263
void MemFree(void *mem)
265
if (mem != (void *)null)
269
/* Set the first size bytes of mem to zero. */
void ClearMemory(void *mem, uint size)
{
    memset(mem, 0, size);
}
276
UTF-8 encoding/decoding functions
277
Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence
279
Also see below for UTF-16 encoding/decoding functions
283
1) UCS Transformation Format 8 (UTF-8):
284
ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
285
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
286
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
288
Table 4 - Mapping from UCS-4 to UTF-8
290
2) Unicode standards:
291
<http://www.unicode.org/unicode/standard/standard.html>
293
3) Legal UTF-8 byte sequences:
294
<http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
296
Code point 1st byte 2nd byte 3rd byte 4th byte
297
---------- -------- -------- -------- --------
298
U+0000..U+007F 00..7F
299
U+0080..U+07FF C2..DF 80..BF
300
U+0800..U+0FFF E0 A0..BF 80..BF
301
U+1000..U+FFFF E1..EF 80..BF 80..BF
302
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
303
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
304
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
306
The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows for the use of
307
five- and six-byte sequences to encode characters that are outside the range of the Unicode
308
character set; those five- and six-byte sequences are illegal for the use of UTF-8 as a
309
transformation of Unicode characters. ISO/IEC 10646 does not allow mapping of
310
unpaired surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).
312
4) RFC 2279: UTF-8, a transformation format of ISO 10646:
313
<http://www.ietf.org/rfc/rfc2279.txt>
315
5) UTF-8 and Unicode FAQ:
316
<http://www.cl.cam.ac.uk/~mgk25/unicode.html>
318
6) Markus Kuhn's UTF-8 decoder stress test file:
319
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
322
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
325
<http://www.columbia.edu/kermit/utf8.html>
327
9) Transformation Format for 16 Planes of Group 00 (UTF-16):
328
ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
329
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
330
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
332
10) RFC 2781: UTF-16, an encoding of ISO 10646:
333
<http://www.ietf.org/rfc/rfc2781.txt>
335
11) UTF-16 invalid surrogate pairs:
336
<http://www.unicode.org/unicode/faq/utf_bom.html#16>
339
D83F DFF* F0 9F BF B* 0001FFF*
340
D87F DFF* F0 AF BF B* 0002FFF*
341
D8BF DFF* F0 BF BF B* 0003FFF*
342
D8FF DFF* F1 8F BF B* 0004FFF*
343
D93F DFF* F1 9F BF B* 0005FFF*
344
D97F DFF* F1 AF BF B* 0006FFF*
346
DBBF DFF* F3 BF BF B* 000FFFF*
347
DBFF DFF* F4 8F BF B* 0010FFF*
360
#define kNumUTF8Sequences 7
361
#define kMaxUTF8Bytes 4
363
#define kUTF8ByteSwapNotAChar 0xFFFE
364
#define kUTF8NotAChar 0xFFFF
366
#define kMaxUTF8FromUCS4 0x10FFFF
368
#define kUTF16SurrogatesBegin 0x10000
369
#define kMaxUTF16FromUCS4 0x10FFFF
371
/* UTF-16 surrogate pair areas */
372
#define kUTF16LowSurrogateBegin 0xD800
373
#define kUTF16LowSurrogateEnd 0xDBFF
374
#define kUTF16HighSurrogateBegin 0xDC00
375
#define kUTF16HighSurrogateEnd 0xDFFF
377
/* offsets into validUTF8 table below */
378
static int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
384
kNumUTF8Sequences /* must be last */
387
static struct validUTF8Sequence
389
unsigned int lowChar;
390
unsigned int highChar;
392
unsigned char validBytes[8];
393
} validUTF8[kNumUTF8Sequences] =
395
/* low high #bytes byte 1 byte 2 byte 3 byte 4 */
396
{0x0000, 0x007F, 1, 0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
397
{0x0080, 0x07FF, 2, 0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00},
398
{0x0800, 0x0FFF, 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00},
399
{0x1000, 0xFFFF, 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00},
400
{0x10000, 0x3FFFF, 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF},
401
{0x40000, 0xFFFFF, 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF},
402
{0x100000, 0x10FFFF, 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}
405
int DecodeUTF8BytesToChar(uint *c, uint firstByte, unsigned char *successorBytes,
406
StreamIn *in, GetBytes getter, int *count)
408
unsigned char tempbuf[10];
409
unsigned char *buf = &tempbuf[0];
415
buf = successorBytes;
417
/* special check if we have been passed an EOF char */
418
if (/* (in && feof(in->file)) || */ firstByte == (uint)EndOfStream)
426
ch = firstByte; /* first byte is passed in separately */
428
if (ch <= 0x7F) /* 0XXX XXXX one byte */
433
else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
438
else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
443
else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
448
else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
454
else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
462
/* not a valid first byte of a UTF-8 sequence */
468
for (i = 1; i < bytes; ++i)
470
int tempCount; /* no. of additional bytes to get */
472
/* successor bytes should have the form 10XX XXXX */
473
if ( getter != null && (bytes - i) > 0 )
475
tempCount = 1; /* to simplify things, get 1 byte at a time */
476
getter(in, (unsigned char *)&buf[i - 1], &tempCount, no);
477
if (tempCount <= 0) /* EOF */
485
if ((buf[i - 1] & 0xC0) != 0x80)
487
/* illegal successor byte value */
492
tempCount = 1; /* to simplify things, unget 1 byte at a time */
493
getter(in, (unsigned char *)&buf[i - 1], &tempCount, yes); /* Unget the byte */
498
n = (n << 6) | (buf[i - 1] & 0x3F);
501
if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
504
if (!hasError && (n > kMaxUTF8FromUCS4))
507
if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
508
/* unpaired surrogates not allowed */
515
lo = offsetUTF8Sequences[bytes - 1];
516
hi = offsetUTF8Sequences[bytes] - 1;
518
/* check for overlong sequences */
519
if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
523
hasError = yes; /* assume error until proven otherwise */
525
for (i = lo; i <= hi; i++)
528
unsigned char theByte;
530
for (tempCount = 0; tempCount < bytes; tempCount++)
535
theByte = buf[tempCount - 1];
537
if ((theByte >= validUTF8[i].validBytes[(tempCount * 2)]) &&
538
(theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1]))
555
tidy_out(errout, "UTF-8 decoding error of %d bytes : ", bytes);
556
tidy_out(errout, "0x%02x ", firstByte);
557
for (i = 1; i < bytes; i++)
558
tidy_out(errout, "0x%02x ", buf[i - 1]);
559
tidy_out(errout, " = U+%04lx\n", n);
562
/* n = 0xFFFD; */ /* replacement char - do this in the caller */
569
int EncodeCharToUTF8Bytes(uint c, unsigned char *encodebuf,
570
Out *out, PutBytes putter, int *count)
572
unsigned char tempbuf[10];
573
unsigned char *buf = &tempbuf[0];
580
if (c <= 0x7F) /* 0XXX XXXX one byte */
585
else if (c <= 0x7FF) /* 110X XXXX two bytes */
587
buf[0] = (0xC0 | (c >> 6));
588
buf[1] = (0x80 | (c & 0x3F));
591
else if (c <= 0xFFFF) /* 1110 XXXX three bytes */
593
buf[0] = (0xE0 | (c >> 12));
594
buf[1] = (0x80 | ((c >> 6) & 0x3F));
595
buf[2] = (0x80 | (c & 0x3F));
597
if ((c == kUTF8ByteSwapNotAChar) || (c == kUTF8NotAChar))
599
else if ((c >= kUTF16LowSurrogateBegin) && (c <= kUTF16HighSurrogateEnd))
600
/* unpaired surrogates not allowed */
603
else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */
605
buf[0] = (0xF0 | (c >> 18));
606
buf[1] = (0x80 | ((c >> 12) & 0x3F));
607
buf[2] = (0x80 | ((c >> 6) & 0x3F));
608
buf[3] = (0x80 | (c & 0x3F));
610
if (c > kMaxUTF8FromUCS4)
613
else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */
615
buf[0] = (0xF8 | (c >> 24));
616
buf[1] = (0x80 | (c >> 18));
617
buf[2] = (0x80 | ((c >> 12) & 0x3F));
618
buf[3] = (0x80 | ((c >> 6) & 0x3F));
619
buf[4] = (0x80 | (c & 0x3F));
623
else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */
625
buf[0] = (0xFC | (c >> 30));
626
buf[1] = (0x80 | ((c >> 24) & 0x3F));
627
buf[2] = (0x80 | ((c >> 18) & 0x3F));
628
buf[3] = (0x80 | ((c >> 12) & 0x3F));
629
buf[4] = (0x80 | ((c >> 6) & 0x3F));
630
buf[5] = (0x80 | (c & 0x3F));
637
/* don't output invalid UTF-8 byte sequence to a stream */
638
if ( !hasError && putter != null )
640
int tempCount = bytes;
642
putter(out, buf, &tempCount);
643
if (tempCount < bytes)
655
tidy_out(errout, "UTF-8 encoding error for U+%x : ", c);
656
for (i = 0; 0 < bytes; i++)
657
tidy_out(errout, "0x%02x ", buf[i]);
658
tidy_out(errout, "\n");
667
StreamIn *OpenInput(FILE *fp)
671
in = (StreamIn *)MemAlloc(sizeof(StreamIn));
675
in->charbuf[0] = '\0';
679
in->encoding = inCharEncoding;
680
in->state = FSM_ASCII;
686
Read raw bytes from stream, return <= 0 if EOF; or if
687
"unget" is true, Unget the bytes to re-synchronize the input stream
688
Normally UTF-8 successor bytes are read using this routine.
690
static void ReadRawBytesFromStream(StreamIn *in, unsigned char *buf, int *count, Bool unget)
694
for (i = 0; i < *count; i++)
698
/* should never get here; testing for 0xFF, a valid char, is not a good idea */
699
if ((in && feof(in->file)) /* || buf[i] == (unsigned char)EndOfStream */)
701
/* tidy_out(errout, "Attempt to unget EOF in ReadRawBytesFromStream\n"); */ /* debug */
708
if (rawBufpos >= CHARBUF_SIZE)
710
memcpy(rawBytebuf, rawBytebuf + 1, CHARBUF_SIZE - 1);
713
rawBytebuf[rawBufpos++] = buf[i];
719
buf[i] = rawBytebuf[--rawBufpos];
748
/* read char from stream */
749
static int ReadCharFromStream(StreamIn *in)
751
static Bool lookingForBOM = yes;
753
unsigned char tempchar;
757
ReadRawBytesFromStream(in, &tempchar, &count, no);
764
#if SUPPORT_UTF16_ENCODINGS
766
in->encoding == UTF16 ||
767
in->encoding == UTF16LE ||
768
in->encoding == UTF16BE ||
771
in->encoding == UTF8))
773
/* check for a Byte Order Mark */
785
ReadRawBytesFromStream(in, &tempchar, &count, no);
788
#if SUPPORT_UTF16_ENCODINGS
792
if (bom == UNICODE_BOM_BE)
794
/* big-endian UTF-16 */
795
if (in->encoding != UTF16 && in->encoding != UTF16BE)
797
/* tidy_out(errout, "Input is encoded as UTF16BE\n"); */ /* debug */
798
ReportEncodingError(in->lexer, ENCODING_MISMATCH, UTF16BE); /* fatal error */
800
in->encoding = UTF16BE;
801
inCharEncoding = UTF16BE;
803
return UNICODE_BOM; /* return decoded BOM */
805
else if (bom == UNICODE_BOM_LE)
807
/* little-endian UTF-16 */
808
if (in->encoding != UTF16 && in->encoding != UTF16LE)
810
/* tidy_out(errout, "Input is encoded as UTF16LE\n"); */ /* debug */
811
ReportEncodingError(in->lexer, ENCODING_MISMATCH, UTF16LE); /* fatal error */
813
in->encoding = UTF16LE;
814
inCharEncoding = UTF16LE;
816
return UNICODE_BOM; /* return decoded BOM */
825
ReadRawBytesFromStream(in, &tempchar, &count, no);
828
if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
831
if (in->encoding != UTF8)
833
/* tidy_out(errout, "Input is encoded as UTF8\n"); */ /* debug */
834
ReportEncodingError(in->lexer, ENCODING_MISMATCH, UTF8); /* fatal error */
837
inCharEncoding = UTF8;
839
return UNICODE_BOM; /* return decoded BOM */
843
/* the 2nd and/or 3rd bytes weren't what we were */
844
/* expecting, so unget the extra 2 bytes */
847
if ((rawBufpos + 1) >= CHARBUF_SIZE)
849
memcpy(rawBytebuf, rawBytebuf + 2, CHARBUF_SIZE - 2);
852
/* make sure the bytes are pushed in the right order */
853
rawBytebuf[rawBufpos++] = (unsigned char)c2;
854
rawBytebuf[rawBufpos++] = (unsigned char)c1;
856
/* drop through to code below, with the original char */
864
A document in ISO-2022 based encoding uses some ESC sequences
865
called "designator" to switch character sets. The designators
866
defined and used in ISO-2022-JP are:
868
"ESC" + "(" + ? for ISO646 variants
871
"ESC" + "$" + "(" + ? for multibyte character sets
873
Where ? stands for a single character used to indicate the
874
character set for multibyte characters.
876
Tidy handles this by preserving the escape sequence and
877
setting the top bit of each byte for non-ascii chars. This
878
bit is then cleared on output. The input stream keeps track
879
of the state to determine when to set/clear the bit.
882
if (in->encoding == ISO2022)
884
if (c == 0x1b) /* ESC */
894
in->state = FSM_ESCD;
896
in->state = FSM_ESCP;
898
in->state = FSM_ASCII;
903
in->state = FSM_ESCDP;
905
in->state = FSM_NONASCII;
909
in->state = FSM_NONASCII;
913
in->state = FSM_ASCII;
924
#if SUPPORT_UTF16_ENCODINGS
926
if (in->encoding == UTF16LE)
931
ReadRawBytesFromStream(in, &tempchar, &count, no);
941
if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
946
ReadRawBytesFromStream(in, &tempchar, &count, no);
958
if (in->encoding == UTF8)
961
/* deal with UTF-8 encoded char */
965
if ((c & 0xE0) == 0xC0) /* 110X XXXX two bytes */
970
else if ((c & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
975
else if ((c & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
980
else if ((c & 0xFC) == 0xF8) /* 1111 10XX five bytes */
985
else if ((c & 0xFE) == 0xFC) /* 1111 110X six bytes */
990
else /* 0XXX XXXX one byte */
993
/* successor bytes should have the form 10XX XXXX */
994
for (i = 1; i <= count; ++i)
997
return EndOfStream; /* hand the end-of-stream marker back to the caller */
1001
n = (n << 6) | (c & 0x3F);
1008
/* deal with UTF-8 encoded char */
1012
/* first byte "c" is passed in separately */
1013
err = DecodeUTF8BytesToChar(&n, c, null, in, ReadRawBytesFromStream, &count);
1014
if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1018
/* set error position just before offending character */
1019
in->lexer->lines = in->curline;
1020
in->lexer->columns = in->curcol;
1022
ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, n);
1023
n = 0xFFFD; /* replacement char */
1030
#if SUPPORT_ASIAN_ENCODINGS
1032
/* #431953 - start RJ */
1034
This section is suitable for any "multibyte" variable-width
1035
character encoding in which a one-byte code is less than
1036
128, and the first byte of a two-byte code is greater or
1037
equal to 128. Note that Big5 and ShiftJIS fit into this
1038
kind, even though their second byte may be less than 128
1040
if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1044
else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1047
Rick Cameron pointed out that for Shift_JIS, the values from
1048
0xa1 through 0xdf represent single-byte characters
1049
(U+FF61 to U+FF9F - half-width Katakana)
1058
ReadRawBytesFromStream(in, &tempchar, &count, no);
1061
c1 = (uint)tempchar;
1068
/* #431953 - end RJ */
1078
int ReadChar(StreamIn *in)
1084
c = in->charbuf[--(in->bufpos)];
1085
if ((in->bufpos) == 0)
1099
in->lastcol = in->curcol;
1110
c = ReadCharFromStream(in);
1124
in->tabs = tabsize - ((in->curcol - 1) % tabsize) - 1;
1130
/* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
1133
c = ReadCharFromStream(in);
1136
if (c == EndOfStream) /* EOF fix by Terry Teague 12 Aug 01 */
1138
/* c = EndOfStream; */ /* debug */
1149
/* strip control characters, except for Esc */
1154
/* Form Feed is allowed in HTML */
1155
if ((c == '\015') && !XmlTags)
1158
if (0 <= c && c < 32)
1159
continue; /* discard control char */
1161
/* watch out for chars that have already been decoded such as */
1162
/* ISO2022, UTF-8 etc, that don't require further decoding */
1166
|| in->encoding == ISO2022
1167
|| in->encoding == UTF8
1169
#if SUPPORT_ASIAN_ENCODINGS
1171
|| in->encoding == SHIFTJIS /* #431953 - RJ */
1172
|| in->encoding == BIG5 /* #431953 - RJ */
1182
#if SUPPORT_UTF16_ENCODINGS
1184
/* handle surrogate pairs */
1185
if ((in->encoding == UTF16LE) || (in->encoding == UTF16) || (in->encoding == UTF16BE))
1187
if (c > kMaxUTF16FromUCS4)
1189
/* invalid UTF-16 value */
1190
ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c);
1193
else if (c >= kUTF16LowSurrogateBegin && c <= kUTF16LowSurrogateEnd) /* high surrogate */
1199
m = ReadCharFromStream(in);
1203
if (m >= kUTF16HighSurrogateBegin && m <= kUTF16HighSurrogateEnd) /* low surrogate */
1205
/* pair found, recombine them */
1206
c = (n - kUTF16LowSurrogateBegin) * 0x400 + (m - kUTF16HighSurrogateBegin) + 0x10000;
1208
/* check for invalid pairs */
1209
if (((c & 0x0000FFFE) == 0x0000FFFE) ||
1210
((c & 0x0000FFFF) == 0x0000FFFF) ||
1211
(c < kUTF16SurrogatesBegin))
1213
ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c);
1219
/* not a valid pair */
1220
ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c);
1222
/* should we unget the just read char? */
1227
/* no recombination needed */
1233
if (in->encoding == MACROMAN)
1234
c = DecodeMacRoman(c);
1236
/* produced e.g. as a side-effect of smart quotes in Word */
1237
/* but can't happen if using MACROMAN encoding */
1238
if (127 < c && c < 160)
1240
int c1, replaceMode;
1242
/* set error position just before offending character */
1243
in->lexer->lines = in->curline;
1244
in->lexer->columns = in->curcol;
1246
if ((in->encoding == WIN1252) || (ReplacementCharEncoding == WIN1252))
1247
c1 = DecodeWin1252(c);
1248
else if (ReplacementCharEncoding == MACROMAN)
1249
c1 = DecodeMacRoman(c);
1251
replaceMode = c1?REPLACED_CHAR:DISCARDED_CHAR;
1253
if ((c1 == 0) && ((in->encoding == WIN1252) || (in->encoding == MACROMAN))) /* && binds tighter than ||: group the two vendor encodings explicitly */
1254
ReportEncodingError(in->lexer, VENDOR_SPECIFIC_CHARS | replaceMode, c);
1255
else if ((in->encoding != WIN1252) && (in->encoding != MACROMAN))
1256
ReportEncodingError(in->lexer, INVALID_SGML_CHARS | replaceMode, c);
1262
continue; /* illegal char is discarded */
1271
void UngetChar(int c, StreamIn *in)
1273
if (c == EndOfStream)
1275
/* tidy_out(errout, "Attempt to UngetChar EOF\n"); */ /* debug */
1280
if (in->bufpos >= CHARBUF_SIZE)
1282
memmove(in->charbuf, in->charbuf + 1, (CHARBUF_SIZE - 1) * sizeof(in->charbuf[0])); /* overlapping shift; length given in bytes whatever the element type */
1285
in->charbuf[(in->bufpos)++] = c;
1290
in->curcol = in->lastcol;
1293
/* like strdup but using MemAlloc */
1294
char *wstrdup(char *str)
1302
for (len = 0; str[len] != '\0'; ++len);
1304
s = (char *)MemAlloc(sizeof(char)*(1+len));
1305
for (p = s; (*p++ = *str++););
1309
/* like strndup but using MemAlloc */
1310
char *wstrndup(char *str, int len)
1314
if (str == null || len < 0)
1317
s = (char *)MemAlloc(sizeof(char)*(1+len));
1321
while (len-- > 0 && (*p++ = *str++));
1327
/* exactly same as strncpy */
1328
void wstrncpy(char *s1, char *s2, int size)
1330
if (s1 != null && s2 != null)
1338
while ((*s1++ = *s2++));
1342
/*
 Copy the string s2, including its terminator, into s1.
 s1 must be large enough; no bounds checking is done.
*/
void wstrcpy(char *s1, char *s2)
{
    while ((*s1++ = *s2++))
        ;
}
1347
/*
 Append the string s2, including its terminator, to the end of s1.
 s1 must have room for the result; no bounds checking is done.
*/
void wstrcat(char *s1, char *s2)
{
    /* advance to the terminator of s1 */
    while (*s1)
        ++s1;

    while ((*s1++ = *s2++))
        ;
}
1355
/*
 Compare two strings in the manner of strcmp.
 Returns 0 when they are equal, 1 when the first differing byte of s1
 is greater than that of s2, and -1 otherwise. Bytes are compared as
 plain char.
*/
int wstrcmp(char *s1, char *s2)
{
    int c;

    while ((c = *s1) == *s2)
    {
        if (c == '\0')
            return 0;

        ++s1;
        ++s2;
    }

    return (*s1 > *s2 ? 1 : -1);
}
1372
/* returns byte count, not char count */
1373
int wstrlen(char *str)
1384
MS C 4.2 doesn't include strcasecmp.
1385
Note that tolower and toupper won't
1388
int wstrcasecmp(char *s1, char *s2)
1392
while (c = (uint)(*s1), ToLower(c) == ToLower((uint)(*s2)))
1401
return (*s1 > *s2 ? 1 : -1);
1404
/*
 Compare at most n bytes of two strings.
 Returns 0 when the first n bytes match or both strings end first;
 otherwise 1 or -1 according to the first differing byte.
*/
int wstrncmp(char *s1, char *s2, int n)
{
    int c;

    while ((c = *s1) == *s2)
    {
        if (c == '\0')
            return 0;

        if (n == 0)
            return 0;

        ++s1;
        ++s2;
        --n;
    }

    /* a mismatch beyond the n compared bytes does not count */
    if (n == 0)
        return 0;

    return (*s1 > *s2 ? 1 : -1);
}
1427
/*
 Case-insensitive comparison of at most n bytes of two strings.
 Returns 0 when the first n bytes match ignoring case or both strings
 end first. Otherwise returns 1 or -1 according to the raw (not
 lower-cased) bytes at the first mismatch.
*/
int wstrncasecmp(char *s1, char *s2, int n)
{
    int c;

    /* tolower requires a value representable as unsigned char (or EOF) */
    while (c = (unsigned char)*s1,
           tolower(c) == tolower((unsigned char)*s2))
    {
        if (c == '\0')
            return 0;

        if (n == 0)
            return 0;

        ++s1;
        ++s2;
        --n;
    }

    /* a mismatch beyond the n compared bytes does not count */
    if (n == 0)
        return 0;

    return (*s1 > *s2 ? 1 : -1);
}
1450
/* return offset of cc from beginning of s1,
1453
int wstrnchr( char *s1, int len1, char cc )
1458
for ( i = 0; i < len1; ++i, ++cp )
1467
Bool wsubstrn( char *s1, int len1, char *s2 )
1469
int i, len2 = wstrlen(s2);
1471
for (i = 0; i <= len1 - len2; ++i)
1473
if (wstrncmp(s1+i, s2, len2) == 0)
1480
Bool wsubstrncase(char *s1, int len1, char *s2 )
1482
int i, len2 = wstrlen(s2);
1484
for (i = 0; i <= len1 - len2; ++i)
1486
if (wstrncasecmp(s1+i, s2, len2) == 0)
1493
Bool wsubstr(char *s1, char *s2)
1495
int i, len1 = wstrlen(s1), len2 = wstrlen(s2);
1497
for (i = 0; i <= len1 - len2; ++i)
1499
if (wstrncasecmp(s1+i, s2, len2) == 0)
1506
/*
 Transform string to lower case in place using ToLower.
 Returns s so the call can be used inside an expression.
*/
char *wstrtolower(char *s)
{
    int i;
    int len = wstrlen(s); /* measured once rather than on every iteration */

    for (i = 0; i < len; ++i)
        s[i] = ToLower(s[i]);

    return s;
}
1516
/* output UTF-8 bytes to output stream */
1517
static void outcUTF8Bytes(Out *out, unsigned char *buf, int *count)
1521
for (i = 0; i < *count; i++)
1523
putc(buf[i], out->fp);
1527
/* For mac users, should we map Unicode back to MacRoman? */
1528
void outc(uint c, Out *out)
1533
if (out->encoding == MACROMAN)
1541
for (i = 128; i < 256; i++)
1542
if (Mac2Unicode[i - 128] == c)
1553
if (out->encoding == WIN1252)
1555
if (c < 128 || (c > 159 && c < 256))
1561
for (i = 128; i < 160; i++)
1562
if (Win2Unicode[i - 128] == c)
1572
if (out->encoding == UTF8)
1577
else if (c <= 0x7FF)
1579
ch = (0xC0 | (c >> 6)); putc(ch, out->fp);
1580
ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);
1582
else if (c <= 0xFFFF)
1584
ch = (0xE0 | (c >> 12)); putc(ch, out->fp);
1585
ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);
1586
ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);
1588
else if (c <= 0x1FFFFF)
1590
ch = (0xF0 | (c >> 18)); putc(ch, out->fp);
1591
ch = (0x80 | ((c >> 12) & 0x3F)); putc(ch, out->fp);
1592
ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);
1593
ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);
1597
ch = (0xF8 | (c >> 24)); putc(ch, out->fp);
1598
ch = (0x80 | ((c >> 18) & 0x3F)); putc(ch, out->fp);
1599
ch = (0x80 | ((c >> 12) & 0x3F)); putc(ch, out->fp);
1600
ch = (0x80 | ((c >> 6) & 0x3F)); putc(ch, out->fp);
1601
ch = (0x80 | (c & 0x3F)); putc(ch, out->fp);
1608
EncodeCharToUTF8Bytes(c, null, out, outcUTF8Bytes, &count);
1611
/* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
1612
/* replacement char 0xFFFD encoded as UTF-8 */
1613
putc(0xEF, out->fp); putc(0xBF, out->fp); putc(0xBD, out->fp); /* U+FFFD is EF BF BD; EF BF BF would be U+FFFF */
1617
else if (out->encoding == ISO2022)
1619
if (c == 0x1b) /* ESC */
1620
out->state = FSM_ESC;
1627
out->state = FSM_ESCD;
1629
out->state = FSM_ESCP;
1631
out->state = FSM_ASCII;
1636
out->state = FSM_ESCDP;
1638
out->state = FSM_NONASCII;
1642
out->state = FSM_NONASCII;
1646
out->state = FSM_ASCII;
1658
#if SUPPORT_UTF16_ENCODINGS
1660
else if (out->encoding == UTF16LE || out->encoding == UTF16BE || out->encoding == UTF16)
1662
int i, numChars = 1;
1665
if (c > kMaxUTF16FromUCS4)
1667
/* invalid UTF-16 value */
1668
/* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
1672
else if (c >= kUTF16SurrogatesBegin)
1674
/* encode surrogate pairs */
1676
/* check for invalid pairs */
1677
if (((c & 0x0000FFFE) == 0x0000FFFE) ||
1678
((c & 0x0000FFFF) == 0x0000FFFF))
1680
/* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
1686
theChars[0] = (c - kUTF16SurrogatesBegin) / 0x400 + kUTF16LowSurrogateBegin;
1687
theChars[1] = (c - kUTF16SurrogatesBegin) % 0x400 + kUTF16HighSurrogateBegin;
1695
/* just put the char out */
1699
for (i = 0; i < numChars; i++)
1703
if (out->encoding == UTF16LE)
1705
ch = c & 0xFF; putc(ch, out->fp);
1706
ch = (c >> 8) & 0xFF; putc(ch, out->fp);
1709
else if (out->encoding == UTF16BE || out->encoding == UTF16)
1711
ch = (c >> 8) & 0xFF; putc(ch, out->fp);
1712
ch = c & 0xFF; putc(ch, out->fp);
1719
#if SUPPORT_ASIAN_ENCODINGS
1721
/* #431953 - start RJ */
1722
else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
1728
ch = (c >> 8) & 0xFF; putc(ch, out->fp);
1729
ch = c & 0xFF; putc(ch, out->fp);
1732
/* #431953 - end RJ */
1740
/* Output a Byte Order Mark if required */
1741
void outBOM(Out *out)
1744
out->encoding == UTF8
1746
#if SUPPORT_UTF16_ENCODINGS
1748
|| out->encoding == UTF16LE
1749
|| out->encoding == UTF16BE
1750
|| out->encoding == UTF16
1754
outc(UNICODE_BOM, out); /* this will take care of encoding the BOM correctly */
1758
first time initialization which should
1759
precede reading the command line
1769
totalerrors = totalwarnings = 0;
1770
XmlTags = XmlOut = HideEndTags = UpperCaseTags =
1771
MakeBare = MakeClean = writeback = OnlyErrors = no;
1778
ParseConfigFile(CONFIG_FILE);
1783
call this when you have finished with tidy
1784
to free the hash tables and other resources
1786
void DeInitTidy(void)
1795
int main(int argc, char **argv)
1798
Node *document, *doctype;
1800
char *s, c, *arg, *current_errorfile = "stderr";
1801
Out out; /* normal output stream */
1802
Bool InputHadBOM = no;
1804
#if PRESERVE_FILE_TIMES
1805
struct utimbuf filetimes;
1812
/* look for env var "HTML_TIDY" */
1813
/* then for ~/.tidyrc (on Unix) */
1815
if ((file = getenv("HTML_TIDY")))
1816
ParseConfigFile(file);
1817
#ifdef SUPPORT_GETPWNAM
1819
ParseConfigFile("~/.tidyrc");
1820
#endif /* SUPPORT_GETPWNAM */
1822
/* read command line */
1828
if (argc > 1 && argv[1][0] == '-')
1830
/* support -foo and --foo */
1836
/* #427667 - fix by Randy Waki 04 Aug 00 */
1838
if (wstrcasecmp(arg, "indent") == 0)
1839
IndentContent = yes;
1840
else */ if (wstrcasecmp(arg, "xml") == 0)
1842
else if (wstrcasecmp(arg, "asxml") == 0 ||
1843
wstrcasecmp(arg, "asxhtml") == 0)
1847
else if (wstrcasecmp(arg, "ashtml") == 0)
1851
else if (wstrcasecmp(arg, "indent") == 0)
1853
IndentContent = yes;
1856
else if (wstrcasecmp(arg, "omit") == 0)
1858
else if (wstrcasecmp(arg, "upper") == 0)
1859
UpperCaseTags = yes;
1860
else if (wstrcasecmp(arg, "clean") == 0)
1862
else if (wstrcasecmp(arg, "bare") == 0)
1864
else if (wstrcasecmp(arg, "raw") == 0)
1865
AdjustCharEncoding(RAW);
1866
else if (wstrcasecmp(arg, "ascii") == 0)
1867
AdjustCharEncoding(ASCII);
1868
else if (wstrcasecmp(arg, "latin1") == 0)
1869
AdjustCharEncoding(LATIN1);
1870
else if (wstrcasecmp(arg, "utf8") == 0)
1871
AdjustCharEncoding(UTF8);
1872
else if (wstrcasecmp(arg, "iso2022") == 0)
1873
AdjustCharEncoding(ISO2022);
1874
else if (wstrcasecmp(arg, "mac") == 0)
1875
AdjustCharEncoding(MACROMAN);
1877
#if SUPPORT_UTF16_ENCODINGS
1879
else if (wstrcasecmp(arg, "utf16le") == 0)
1880
AdjustCharEncoding(UTF16LE);
1881
else if (wstrcasecmp(arg, "utf16be") == 0)
1882
AdjustCharEncoding(UTF16BE);
1883
else if (wstrcasecmp(arg, "utf16") == 0)
1884
AdjustCharEncoding(UTF16);
1888
else if (wstrcasecmp(arg, "win1252") == 0)
1889
AdjustCharEncoding(WIN1252);
1891
#if SUPPORT_ASIAN_ENCODINGS
1893
else if (wstrcasecmp(arg, "shiftjis") == 0) /* #431953 - RJ */
1894
AdjustCharEncoding(SHIFTJIS);
1895
else if (wstrcasecmp(arg, "big5") == 0) /* #431953 - RJ */
1896
AdjustCharEncoding(BIG5);
1900
else if (wstrcasecmp(arg, "numeric") == 0)
1902
else if (wstrcasecmp(arg, "modify") == 0)
1904
else if (wstrcasecmp(arg, "change") == 0) /* obsolete */
1906
else if (wstrcasecmp(arg, "update") == 0) /* obsolete */
1908
else if (wstrcasecmp(arg, "errors") == 0)
1910
else if (wstrcasecmp(arg, "quiet") == 0)
1912
else if (wstrcasecmp(arg, "slides") == 0)
1914
else if (wstrcasecmp(arg, "help") == 0 ||
1915
wstrcasecmp(arg, "h") == 0 ||
1918
HelpText(stdout, prog);
1920
DeInitTidy(); /* called to free hash tables etc. */
1921
return 0; /* was return 1 */
1923
else if (wstrcasecmp(arg, "help-config") == 0)
1925
PrintConfigOptions(stdout, no);
1927
DeInitTidy(); /* called to free hash tables etc. */
1937
else if (wstrcasecmp(arg, "show-config") == 0)
1939
AdjustConfig(); /* ensure config is self-consistent */
1940
PrintConfigOptions(errout, yes);
1942
DeInitTidy(); /* called to free hash tables etc. */
1952
else if (wstrcasecmp(arg, "config") == 0)
1956
ParseConfigFile(argv[2]);
1962
#if SUPPORT_ASIAN_ENCODINGS
1964
/* #431953 - start RJ */
1965
else if (wstrcasecmp(arg, "language") == 0 ||
1966
wstrcasecmp(arg, "lang") == 0)
1975
/* #431953 - end RJ */
1979
else if (wstrcasecmp(arg, "file") == 0 ||
1980
wstrcasecmp(arg, "-file") == 0 ||
1981
wstrcasecmp(arg, "f") == 0)
1985
/* create copy that can be freed by FreeConfig() */
1986
errfile = wstrdup(argv[2]);
1991
else if (wstrcasecmp(arg, "wrap") == 0 ||
1992
wstrcasecmp(arg, "-wrap") == 0 ||
1993
wstrcasecmp(arg, "w") == 0)
1997
sscanf(argv[2], "%d", &wraplen);
2002
else if (wstrcasecmp(arg, "version") == 0 ||
2003
wstrcasecmp(arg, "-version") == 0 ||
2004
wstrcasecmp(arg, "v") == 0)
2006
ShowVersion(errout);
2007
/* called to free hash tables etc. */
2012
else if (strncmp(argv[1], "--", 2) == 0)
2014
if (ParseConfig(argv[1] + 2, argv[2]))
2028
IndentContent = yes;
2034
UpperCaseTags = yes;
2048
UnknownOption(stderr, c);
2057
/* ensure config is self-consistent */
2060
/* user specified error file */
2065
/* is it same as the currently opened file? */
2067
/* this comparison could be an issue on filesystems that are not case-sensitive */
2068
/* e.g. Mac OS HFS; but if we use wstrcasecmp(), we will have the same issue on */
2069
/* file systems that are case-sensitive - e.g. UFS */
2070
if (wstrcmp(errfile, current_errorfile) != 0)
2072
/* no, so close the previous error file */
2074
if (errout != stderr)
2077
/* and try to open the new error file */
2078
fp = fopen(errfile, "w");
2083
current_errorfile = errfile;
2085
else /* can't be opened so fall back to stderr */
2088
current_errorfile = "stderr";
2098
input = fopen(file, "r");
2100
#if PRESERVE_FILE_TIMES
2101
/* get last modified time */
2102
if (KeepFileTimes && input && fstat(fileno(input), &sbuf) != -1)
2104
filetimes.actime = sbuf.st_atime;
2105
filetimes.modtime = sbuf.st_mtime;
2106
haveFileTimes = yes;
2118
lexer = NewLexer(OpenInput(input));
2119
lexer->errout = errout;
2122
store pointer to lexer in input stream
2123
to allow character encoding errors to be
2126
lexer->in->lexer = lexer;
2128
SetFilename(file); /* #431895 - fix by Dave Bryan 04 Jan 01 */
2131
HelloMessage(errout, release_date, file);
2133
/* skip byte order mark */
2134
if (lexer->in->encoding == UTF8
2136
#if SUPPORT_UTF16_ENCODINGS
2138
|| lexer->in->encoding == UTF16LE
2139
|| lexer->in->encoding == UTF16BE
2140
|| lexer->in->encoding == UTF16
2146
uint c = ReadChar(lexer->in);
2148
if (c == UNICODE_BOM)
2151
UngetChar(c, lexer->in);
2154
/* Tidy doesn't alter the doctype for generic XML docs */
2157
document = ParseXMLDocument(lexer);
2159
if (!CheckNodeIntegrity(document))
2161
fprintf(stderr, "\nPanic - tree has lost its integrity\n");
2167
lexer->warnings = 0;
2169
document = ParseDocument(lexer);
2171
if (!CheckNodeIntegrity(document))
2173
fprintf(stderr, "\nPanic - tree has lost its integrity\n");
2177
/* simplifies <b><b> ... </b> ...</b> etc. */
2178
NestedEmphasis(document);
2180
/* cleans up <dir>indented text</dir> etc. */
2184
/* replaces i by em and b by strong */
2185
if (LogicalEmphasis)
2188
if (Word2000 && IsWord2000(document))
2190
/* prune Word2000's <![if ...]> ... <![endif]> */
2191
DropSections(lexer, document);
2193
/* drop style & class attributes and empty p, span elements */
2194
CleanWord2000(lexer, document);
2197
/* replaces presentational markup by style rules */
2198
if (MakeClean || DropFontTags)
2199
CleanTree(lexer, document);
2201
if (!CheckNodeIntegrity(document))
2203
fprintf(stderr, "\nPanic - tree has lost its integrity\n");
2207
if (document->content)
2210
SetXHTMLDocType(lexer, document);
2212
FixDocType(lexer, document);
2215
AddGenerator(lexer, document);
2218
/* ensure presence of initial <?XML version="1.0"?> */
2219
if (XmlOut && XmlPi)
2220
FixXmlDecl(lexer, document);
2223
totalwarnings += lexer->warnings;
2224
totalerrors += lexer->errors;
2227
if (!Quiet && document->content)
2229
doctype = FindDocType(document);
2230
ReportVersion(errout, lexer, file, doctype);
2231
/* ReportNumWarnings(errout, lexer); */
2242
totalwarnings += lexer->warnings;
2243
totalerrors += lexer->errors;
2246
ReportNumWarnings(errout, lexer);
2248
if (lexer->errors > 0 && !ForceOutput)
2249
NeedsAuthorIntervention(errout);
2251
out.state = FSM_ASCII;
2252
out.encoding = outCharEncoding;
2254
if (!OnlyErrors && (lexer->errors == 0 || ForceOutput))
2258
Node *body, *doctype;
2261
remove doctype to avoid potential clash with
2262
markup introduced when bursting into slides
2264
/* discard the document type */
2265
doctype = FindDocType(document);
2268
DiscardElement(doctype);
2270
/* slides use transitional features */
2271
lexer->versions |= VERS_HTML40_LOOSE;
2273
/* and patch up doctype to match */
2275
SetXHTMLDocType(lexer, document);
2277
FixDocType(lexer, document);
2280
/* find the body element which may be implicit */
2281
body = FindBody(document);
2285
ReportNumberOfSlides(errout, CountSlides(body));
2286
CreateSlides(lexer, document);
2289
MissingBody(errout);
2291
else if (writeback && (input = fopen(file, "w")))
2295
/* Output a Byte Order Mark if required */
2296
if (OutputBOM || (InputHadBOM && SmartBOM))
2299
if (!FindDocType(document))
2302
if (XmlOut && !xHTML /*XmlTags*/) /* #427826 - fix by Dave Raggett 01 Sep 00 */
2303
PPrintXMLTree(&out, null, 0, lexer, document);
2304
/* Feature request #434940 - fix by Dave Raggett/Ignacio Vazquez-Abrams 21 Jun 01 */
2306
PrintBody(&out, lexer, document);
2308
PPrintTree(&out, null, 0, lexer, document);
2310
PFlushLine(&out, 0);
2312
#if PRESERVE_FILE_TIMES
2314
#if UTIME_NEEDS_CLOSED_FILE
2315
/* close the file first */
2319
/* set file last accessed/modified times to original values */
2322
utime(file, &filetimes);
2324
futime(fileno(input), &filetimes);
2327
#if !UTIME_NEEDS_CLOSED_FILE
2328
/* close the file later */
2336
#endif /* PRESERVE_FILE_TIMES */
2342
/* Output a Byte Order Mark if required */
2343
if (OutputBOM || (InputHadBOM && SmartBOM))
2346
if (!FindDocType(document))
2349
if (XmlOut && !xHTML /*XmlTags*/) /* #427826 - fix by Dave Raggett 01 Sep 00 */
2350
PPrintXMLTree(&out, null, 0, lexer, document);
2351
/* Feature request #434940 - fix by Dave Raggett/Ignacio Vazquez-Abrams 21 Jun 01 */
2353
PrintBody(&out, lexer, document);
2355
PPrintTree(&out, null, 0, lexer, document);
2357
PFlushLine(&out, 0);
2363
ErrorSummary(lexer);
2369
UnknownFile(errout, prog, file);
2378
if (totalerrors + totalwarnings > 0 && !Quiet)
2379
GeneralInfo(errout);
2381
if (errout != stderr)
2384
/* called to free hash tables etc. */
2387
/* return status can be used by scripts */
2389
if (totalerrors > 0)
2392
if (totalwarnings > 0)
2395
/* 0 signifies all is ok */