1
/* streamio.c -- handles character stream I/O
3
(c) 1998-2005 (W3C) MIT, ERCIM, Keio University
4
See tidy.h for the copyright notice.
9
$Date: 2005/04/08 09:11:13 $
12
Wrapper around Tidy input source and output sink
13
that calls appropriate interfaces, and applies
14
necessary char encoding transformations: to/from
15
ISO-10646 and/or UTF-8.
29
#ifdef TIDY_WIN32_MLANG_SUPPORT
33
/************************
34
** Forward Declarations
35
************************/
37
static uint ReadCharFromStream( StreamIn* in );
39
static uint ReadByte( StreamIn* in );
40
static void UngetByte( StreamIn* in, uint byteValue );
42
static void PutByte( uint byteValue, StreamOut* out );
44
static void EncodeWin1252( uint c, StreamOut* out );
45
static void EncodeMacRoman( uint c, StreamOut* out );
46
static void EncodeIbm858( uint c, StreamOut* out );
47
static void EncodeLatin0( uint c, StreamOut* out );
49
/******************************
50
** Static (duration) Globals
51
******************************/
53
static StreamOut stderrStreamOut =
58
#ifdef TIDY_WIN32_MLANG_SUPPORT
62
{ 0, filesink_putByte }
65
static StreamOut stdoutStreamOut =
70
#ifdef TIDY_WIN32_MLANG_SUPPORT
74
{ 0, filesink_putByte }
77
StreamOut* StdErrOutput(void)
79
if ( stderrStreamOut.sink.sinkData == 0 )
80
stderrStreamOut.sink.sinkData = (ulong) stderr;
81
return &stderrStreamOut;
84
StreamOut* StdOutOutput(void)
86
if ( stdoutStreamOut.sink.sinkData == 0 )
87
stdoutStreamOut.sink.sinkData = (ulong) stdout;
88
return &stdoutStreamOut;
91
void ReleaseStreamOut( StreamOut* out )
93
if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
95
if ( out->iotype == FileIO )
96
fclose( (FILE*) out->sink.sinkData );
102
/************************
104
************************/
106
static StreamIn* initStreamIn( TidyDocImpl* doc, int encoding )
108
StreamIn *in = (StreamIn*) MemAlloc( sizeof(StreamIn) );
110
ClearMemory( in, sizeof(StreamIn) );
113
in->encoding = encoding;
114
in->state = FSM_ASCII;
116
in->bufsize = CHARBUF_SIZE;
117
in->charbuf = MemAlloc(sizeof(tchar) * in->bufsize);
118
#ifdef TIDY_STORE_ORIGINAL_TEXT
126
void freeStreamIn(StreamIn* in)
128
#ifdef TIDY_STORE_ORIGINAL_TEXT
130
MemFree(in->otextbuf);
132
MemFree(in->charbuf);
136
StreamIn* FileInput( TidyDocImpl* doc, FILE *fp, int encoding )
138
StreamIn *in = initStreamIn( doc, encoding );
139
initFileSource( &in->source, fp );
144
StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
146
StreamIn *in = initStreamIn( doc, encoding );
147
initInputBuffer( &in->source, buf );
148
in->iotype = BufferIO;
152
StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding )
154
StreamIn *in = initStreamIn( doc, encoding );
155
memcpy( &in->source, source, sizeof(TidyInputSource) );
160
int ReadBOMEncoding(StreamIn *in)
163
#if SUPPORT_UTF16_ENCODINGS
168
if (c == EndOfStream)
172
if (c1 == EndOfStream)
178
/* todo: don't warn about mismatch for auto input encoding */
179
/* todo: let the user override the encoding found here */
181
#if SUPPORT_UTF16_ENCODINGS
184
if ( bom == UNICODE_BOM_BE )
186
/* big-endian UTF-16 */
187
if ( in->encoding != UTF16 && in->encoding != UTF16BE )
188
ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16BE);
190
return UTF16BE; /* return decoded BOM */
192
else if (bom == UNICODE_BOM_LE)
194
/* little-endian UTF-16 */
195
if (in->encoding != UTF16 && in->encoding != UTF16LE)
196
ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16LE);
198
return UTF16LE; /* return decoded BOM */
201
#endif /* SUPPORT_UTF16_ENCODINGS */
203
uint c2 = ReadByte(in);
205
if (c2 == EndOfStream)
212
if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8)
215
if (in->encoding != UTF8)
216
ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF8);
230
#ifdef TIDY_STORE_ORIGINAL_TEXT
231
void AddByteToOriginalText(StreamIn *in, tmbchar c)
233
if (in->otextlen + 1 >= in->otextsize)
235
size_t size = in->otextsize ? 1 : 2;
236
in->otextbuf = MemRealloc(in->otextbuf, in->otextsize + size);
237
in->otextsize += size;
239
in->otextbuf[in->otextlen++] = c;
240
in->otextbuf[in->otextlen ] = 0;
243
void AddCharToOriginalText(StreamIn *in, tchar c)
245
int i, err, count = 0;
246
tmbchar buf[10] = {0};
248
err = EncodeCharToUTF8Bytes(c, buf, NULL, &count);
252
/* replacement character 0xFFFD encoded as UTF-8 */
253
buf[0] = (byte) 0xEF;
254
buf[1] = (byte) 0xBF;
255
buf[2] = (byte) 0xBD;
259
for (i = 0; i < count; ++i)
260
AddByteToOriginalText(in, buf[i]);
265
uint ReadChar( StreamIn *in )
267
uint c = EndOfStream;
268
uint tabsize = cfg( in->doc, TidyTabSize );
269
#ifdef TIDY_STORE_ORIGINAL_TEXT
274
return PopChar( in );
276
in->lastcol = in->curcol;
287
c = ReadCharFromStream(in);
289
if ( EndOfStream == c )
294
#ifdef TIDY_STORE_ORIGINAL_TEXT
296
AddCharToOriginalText(in, (tchar)c);
305
#ifdef TIDY_STORE_ORIGINAL_TEXT
307
AddCharToOriginalText(in, (tchar)c);
309
in->tabs = tabsize - ((in->curcol - 1) % tabsize) - 1;
315
/* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */
318
#ifdef TIDY_STORE_ORIGINAL_TEXT
320
AddCharToOriginalText(in, (tchar)c);
322
c = ReadCharFromStream(in);
330
#ifdef TIDY_STORE_ORIGINAL_TEXT
331
AddCharToOriginalText(in, (tchar)c);
339
#ifndef NO_NATIVE_ISO2022_SUPPORT
340
/* strip control characters, except for Esc */
345
/* Form Feed is allowed in HTML */
346
if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) )
350
continue; /* discard control char */
352
/* watch out for chars that have already been decoded such as */
353
/* ISO2022, UTF-8 etc., that don't require further decoding */
357
#ifndef NO_NATIVE_ISO2022_SUPPORT
358
|| in->encoding == ISO2022
360
|| in->encoding == UTF8
362
#if SUPPORT_ASIAN_ENCODINGS
363
|| in->encoding == SHIFTJIS /* #431953 - RJ */
364
|| in->encoding == BIG5 /* #431953 - RJ */
372
#if SUPPORT_UTF16_ENCODINGS
373
/* handle surrogate pairs */
374
if ( in->encoding == UTF16LE ||
375
in->encoding == UTF16 ||
376
in->encoding == UTF16BE )
378
if ( !IsValidUTF16FromUCS4(c) )
380
/* invalid UTF-16 value */
381
ReportEncodingError(in->doc, INVALID_UTF16, c, yes);
384
else if ( IsLowSurrogate(c) )
387
uint m = ReadCharFromStream( in );
388
if ( m == EndOfStream )
392
if ( IsHighSurrogate(m) )
394
n = CombineSurrogatePair( m, n );
395
if ( IsValidCombinedChar(n) )
398
/* not a valid pair */
400
ReportEncodingError( in->doc, INVALID_UTF16, c, yes );
405
/* Do first: acts on range 128 - 255 */
406
switch ( in->encoding )
409
c = DecodeMacRoman( c );
412
c = DecodeIbm850( c );
415
c = DecodeLatin0( c );
419
/* produced e.g. as a side-effect of smart quotes in Word */
420
/* but can't happen if using MACROMAN encoding */
421
if ( 127 < c && c < 160 )
423
uint c1 = 0, replMode = DISCARDED_CHAR;
424
Bool isVendorChar = ( in->encoding == WIN1252 ||
425
in->encoding == MACROMAN );
426
Bool isWinChar = ( in->encoding == WIN1252 ||
427
ReplacementCharEncoding == WIN1252 );
428
Bool isMacChar = ( in->encoding == MACROMAN ||
429
ReplacementCharEncoding == MACROMAN );
431
/* set error position just before offending character */
432
in->doc->lexer->lines = in->curline;
433
in->doc->lexer->columns = in->curcol;
436
c1 = DecodeWin1252( c );
437
else if ( isMacChar )
438
c1 = DecodeMacRoman( c );
440
replMode = REPLACED_CHAR;
442
if ( c1 == 0 && isVendorChar )
443
ReportEncodingError(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
444
else if ( ! isVendorChar )
445
ReportEncodingError(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
451
continue; /* illegal char is discarded */
457
#ifdef TIDY_STORE_ORIGINAL_TEXT
459
AddCharToOriginalText(in, (tchar)c);
465
uint PopChar( StreamIn *in )
467
uint c = EndOfStream;
470
assert( in->bufpos > 0 );
471
c = in->charbuf[ --in->bufpos ];
472
if ( in->bufpos == 0 )
486
void UngetChar( uint c, StreamIn *in )
488
if (c == EndOfStream)
490
/* fprintf(stderr, "Attempt to UngetChar EOF\n"); */
496
if (in->bufpos + 1 >= in->bufsize)
497
in->charbuf = MemRealloc(in->charbuf, sizeof(tchar) * ++(in->bufsize));
499
in->charbuf[(in->bufpos)++] = c;
504
in->curcol = in->lastcol;
509
/************************
511
************************/
513
static StreamOut* initStreamOut( int encoding, uint nl )
515
StreamOut* out = (StreamOut*) MemAlloc( sizeof(StreamOut) );
516
ClearMemory( out, sizeof(StreamOut) );
517
out->encoding = encoding;
518
out->state = FSM_ASCII;
523
StreamOut* FileOutput( FILE* fp, int encoding, uint nl )
525
StreamOut* out = initStreamOut( encoding, nl );
526
initFileSink( &out->sink, fp );
527
out->iotype = FileIO;
530
StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint nl )
532
StreamOut* out = initStreamOut( encoding, nl );
533
initOutputBuffer( &out->sink, buf );
534
out->iotype = BufferIO;
537
StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint nl )
539
StreamOut* out = initStreamOut( encoding, nl );
540
memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
541
out->iotype = UserIO;
545
void WriteChar( uint c, StreamOut* out )
547
/* Translate outgoing newlines */
550
if ( out->nl == TidyCRLF )
551
WriteChar( CR, out );
552
else if ( out->nl == TidyCR )
556
if (out->encoding == MACROMAN)
558
EncodeMacRoman( c, out );
560
else if (out->encoding == WIN1252)
562
EncodeWin1252( c, out );
564
else if (out->encoding == IBM858)
566
EncodeIbm858( c, out );
568
else if (out->encoding == LATIN0)
570
EncodeLatin0( c, out );
573
else if (out->encoding == UTF8)
577
EncodeCharToUTF8Bytes( c, NULL, &out->sink, &count );
580
/* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
581
/* replacement char 0xFFFD encoded as UTF-8 */
582
/* U+FFFD is EF BF BD in UTF-8; a trailing 0xBF would emit U+FFFF instead */
PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBD, out);
585
#ifndef NO_NATIVE_ISO2022_SUPPORT
586
else if (out->encoding == ISO2022)
588
if (c == 0x1b) /* ESC */
589
out->state = FSM_ESC;
596
out->state = FSM_ESCD;
598
out->state = FSM_ESCP;
600
out->state = FSM_ASCII;
605
out->state = FSM_ESCDP;
607
out->state = FSM_NONASCII;
611
out->state = FSM_NONASCII;
615
out->state = FSM_ASCII;
626
#endif /* NO_NATIVE_ISO2022_SUPPORT */
628
#if SUPPORT_UTF16_ENCODINGS
629
else if ( out->encoding == UTF16LE ||
630
out->encoding == UTF16BE ||
631
out->encoding == UTF16 )
636
if ( !IsValidUTF16FromUCS4(c) )
638
/* invalid UTF-16 value */
639
/* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
643
else if ( IsCombinedChar(c) )
645
/* output both, unless something goes wrong */
647
if ( !SplitSurrogatePair(c, &theChars[0], &theChars[1]) )
649
/* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
656
/* just put the char out */
660
for (i = 0; i < numChars; i++)
664
if (out->encoding == UTF16LE)
666
uint ch = c & 0xFF; PutByte(ch, out);
667
ch = (c >> 8) & 0xFF; PutByte(ch, out);
670
else if (out->encoding == UTF16BE || out->encoding == UTF16)
672
uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
673
ch = c & 0xFF; PutByte(ch, out);
679
#if SUPPORT_ASIAN_ENCODINGS
680
else if (out->encoding == BIG5 || out->encoding == SHIFTJIS)
686
uint ch = (c >> 8) & 0xFF; PutByte(ch, out);
687
ch = c & 0xFF; PutByte(ch, out);
698
/****************************
699
** Miscellaneous / Helpers
700
****************************/
702
/* char encoding used when replacing illegal SGML chars,
703
** regardless of specified encoding. Set at compile time
704
** to either Windows or Mac.
706
const int ReplacementCharEncoding = DFLT_REPL_CHARENC;
709
/* Mapping for Windows Western character set CP 1252
710
** (chars 128-159/U+0080-U+009F) to Unicode.
712
static const uint Win2Unicode[32] =
714
0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
715
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000,
716
0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
717
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178
720
/* Function for conversion from Windows-1252 to Unicode */
721
uint DecodeWin1252(uint c)
723
if (127 < c && c < 160)
724
c = Win2Unicode[c - 128];
729
static void EncodeWin1252( uint c, StreamOut* out )
731
if (c < 128 || (c > 159 && c < 256))
737
for (i = 128; i < 160; i++)
738
if (Win2Unicode[i - 128] == c)
747
John Love-Jensen contributed this table for mapping MacRoman
748
character set to Unicode
751
/* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */
752
static const uint Mac2Unicode[128] =
756
0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1,
757
0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
759
0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3,
760
0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
762
0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF,
763
0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
765
0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211,
766
/* =BD U+2126 OHM SIGN */
767
0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
769
0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB,
770
0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
772
0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA,
773
/* =DB U+00A4 CURRENCY SIGN */
774
0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
776
0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1,
777
0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
778
/* xF0 = Apple Logo */
779
/* =F0 U+2665 BLACK HEART SUIT */
780
0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC,
781
0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7
784
/* Function to convert from MacRoman to Unicode */
785
uint DecodeMacRoman(uint c)
788
c = Mac2Unicode[c - 128];
792
static void EncodeMacRoman( uint c, StreamOut* out )
798
/* For mac users, map Unicode back to MacRoman. */
800
for (i = 128; i < 256; i++)
802
if (Mac2Unicode[i - 128] == c)
811
/* Mapping for OS/2 Western character set CP 850
812
** (chars 128-255) to Unicode.
814
static const uint IBM2Unicode[128] =
816
0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
817
0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
818
0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
819
0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192,
820
0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
821
0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
822
0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0,
823
0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510,
824
0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3,
825
0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
826
0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce,
827
0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
828
0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
829
0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
830
0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
831
0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0
834
/* Function for conversion from OS/2-850 to Unicode */
835
uint DecodeIbm850(uint c)
837
if (127 < c && c < 256)
838
c = IBM2Unicode[c - 128];
843
/* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */
844
static void EncodeIbm858( uint c, StreamOut* out )
851
for (i = 128; i < 256; i++)
853
if (IBM2Unicode[i - 128] == c)
863
/* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */
864
uint DecodeLatin0(uint c)
866
if (159 < c && c < 191)
870
case 0xA4: c = 0x20AC; break;
871
case 0xA6: c = 0x0160; break;
872
case 0xA8: c = 0x0161; break;
873
case 0xB4: c = 0x017D; break;
874
case 0xB8: c = 0x017E; break;
875
case 0xBC: c = 0x0152; break;
876
case 0xBD: c = 0x0153; break;
877
case 0xBE: c = 0x0178; break;
883
/* Map Unicode back to ISO-8859-15. */
884
static void EncodeLatin0( uint c, StreamOut* out )
888
case 0x20AC: c = 0xA4; break;
889
case 0x0160: c = 0xA6; break;
890
case 0x0161: c = 0xA8; break;
891
case 0x017D: c = 0xB4; break;
892
case 0x017E: c = 0xB8; break;
893
case 0x0152: c = 0xBC; break;
894
case 0x0153: c = 0xBD; break;
895
case 0x0178: c = 0xBE; break;
901
Table to map symbol font characters to Unicode; undefined
902
characters are mapped to 0x0000 and characters without any
903
Unicode equivalent are mapped to '?'. Is this appropriate?
906
static const uint Symbol2Unicode[] =
908
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
909
0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F,
911
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
912
0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
914
0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D,
915
0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F,
917
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
918
0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
920
0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393,
921
0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F,
923
0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9,
924
0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F,
926
0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3,
927
0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF,
929
0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9,
930
0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F,
932
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
933
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
935
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
936
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
938
0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663,
939
0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
941
0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7,
942
0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5,
944
0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
945
0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
947
0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5,
948
0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3,
950
0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F,
951
0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F,
953
0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F,
954
0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F
957
/* Function to convert from Symbol Font chars to Unicode */
958
uint DecodeSymbolFont(uint c)
963
/* todo: add some error message */
965
return Symbol2Unicode[c];
969
/* Facilitates user defined source by providing
970
** an entry point to marshal pointers-to-functions.
971
** Needed by .NET and possibly other language bindings.
973
Bool TIDY_CALL tidyInitSource( TidyInputSource* source,
975
TidyGetByteFunc gbFunc,
976
TidyUngetByteFunc ugbFunc,
977
TidyEOFFunc endFunc )
979
Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc );
983
source->sourceData = (ulong) srcData;
984
source->getByte = gbFunc;
985
source->ungetByte = ugbFunc;
986
source->eof = endFunc;
992
Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink,
994
TidyPutByteFunc pbFunc )
996
Bool status = ( sink && snkData && pbFunc );
999
sink->sinkData = (ulong) snkData;
1000
sink->putByte = pbFunc;
1005
/* GetByte must return a byte value in a signed
1006
** integer so that a negative value can signal EOF
1007
** without interfering w/ 0-255 legitimate byte values.
1009
uint TIDY_CALL tidyGetByte( TidyInputSource* source )
1011
int bv = source->getByte( source->sourceData );
1014
/* Asks the source's eof callback, passing the source's own data
** handle, and returns the callback's answer.
*/
Bool TIDY_CALL tidyIsEOF( TidyInputSource* source )
{
    Bool atEnd = source->eof( source->sourceData );
    return atEnd;
}
1018
/* Hands one byte back to the source through its ungetByte callback.
** The value is narrowed to a byte before the call.
*/
void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch )
{
    byte pushedBack = (byte) ch;
    source->ungetByte( source->sourceData, pushedBack );
}
1022
/* Writes one byte to the sink through its putByte callback.
** The value is narrowed to a byte before the call.
*/
void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch )
{
    byte outgoing = (byte) ch;
    sink->putByte( sink->sinkData, outgoing );
}
1027
/* Fetches the next byte for an input stream by forwarding to
** tidyGetByte on the stream's embedded input source.
*/
static uint ReadByte( StreamIn* in )
{
    TidyInputSource* src = &in->source;
    return tidyGetByte( src );
}
1031
/* Reports end of input for a stream by forwarding to tidyIsEOF
** on the stream's embedded input source.
*/
Bool IsEOF( StreamIn* in )
{
    TidyInputSource* src = &in->source;
    return tidyIsEOF( src );
}
1035
/* Pushes a byte back onto an input stream by forwarding to
** tidyUngetByte on the stream's embedded input source.
*/
static void UngetByte( StreamIn* in, uint byteValue )
{
    TidyInputSource* src = &in->source;
    tidyUngetByte( src, byteValue );
}
1039
/* Emits a byte on an output stream by forwarding to tidyPutByte
** on the stream's embedded output sink.
*/
static void PutByte( uint byteValue, StreamOut* out )
{
    TidyOutputSink* snk = &out->sink;
    tidyPutByte( snk, byteValue );
}
1045
static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count )
1049
for (i = 0; i < *count; i++)
1051
/* should never get here; testing for 0xFF, a valid char, is not a good idea */
1052
if ( in && IsEOF(in) )
1054
/* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */
1059
in->source.ungetByte( in->source.sourceData, buf[i] );
1064
Read raw bytes from stream, return <= 0 if EOF; or if
1065
"unget" is true, Unget the bytes to re-synchronize the input stream
1066
Normally UTF-8 successor bytes are read using this routine.
1068
static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count )
1071
for ( ix=0; ix < *count; ++ix )
1073
if ( in->rawPushed )
1075
buf[ix] = in->rawBytebuf[ --in->rawBufpos ];
1076
if ( in->rawBufpos == 0 )
1081
if ( in->source.eof(in->source.sourceData) )
1086
buf[ix] = in->source.getByte( in->source.sourceData );
1092
/* read char from stream */
1093
static uint ReadCharFromStream( StreamIn* in )
1096
#ifdef TIDY_WIN32_MLANG_SUPPORT
1105
if (c == EndOfStream)
1108
#ifndef NO_NATIVE_ISO2022_SUPPORT
1110
A document in ISO-2022 based encoding uses some ESC sequences
1111
called "designator" to switch character sets. The designators
1112
defined and used in ISO-2022-JP are:
1114
"ESC" + "(" + ? for ISO646 variants
1117
"ESC" + "$" + "(" + ? for multibyte character sets
1119
Where ? stands for a single character used to indicate the
1120
character set for multibyte characters.
1122
Tidy handles this by preserving the escape sequence and
1123
setting the top bit of each byte for non-ascii chars. This
1124
bit is then cleared on output. The input stream keeps track
1125
of the state to determine when to set/clear the bit.
1128
if (in->encoding == ISO2022)
1130
if (c == 0x1b) /* ESC */
1132
in->state = FSM_ESC;
1140
in->state = FSM_ESCD;
1142
in->state = FSM_ESCP;
1144
in->state = FSM_ASCII;
1149
in->state = FSM_ESCDP;
1151
in->state = FSM_NONASCII;
1155
in->state = FSM_NONASCII;
1159
in->state = FSM_ASCII;
1169
#endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */
1171
#if SUPPORT_UTF16_ENCODINGS
1172
if ( in->encoding == UTF16LE )
1174
uint c1 = ReadByte( in );
1175
if ( EndOfStream == c1 )
1181
if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */
1183
uint c1 = ReadByte( in );
1184
if ( EndOfStream == c1 )
1191
if ( in->encoding == UTF8 )
1193
/* deal with UTF-8 encoded char */
1197
/* first byte "c" is passed in separately */
1198
err = DecodeUTF8BytesToChar( &n, c, NULL, &in->source, &count );
1199
if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1203
/* set error position just before offending character */
1204
in->doc->lexer->lines = in->curline;
1205
in->doc->lexer->columns = in->curcol;
1207
ReportEncodingError(in->doc, INVALID_UTF8, n, no);
1208
n = 0xFFFD; /* replacement char */
1214
#if SUPPORT_ASIAN_ENCODINGS
1216
This section is suitable for any "multibyte" variable-width
1217
character encoding in which a one-byte code is less than
1218
128, and the first byte of a two-byte code is greater or
1219
equal to 128. Note that Big5 and ShiftJIS fit into this
1220
kind, even though their second byte may be less than 128
1222
if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS))
1226
else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */
1229
Rick Cameron pointed out that for Shift_JIS, the values from
1230
0xa1 through 0xdf represent single-byte characters
1231
(U+FF61 to U+FF9F - half-width Katakana)
1237
uint c1 = ReadByte( in );
1238
if ( EndOfStream == c1 )
1246
#ifdef TIDY_WIN32_MLANG_SUPPORT
1247
else if (in->encoding > WIN32MLANG)
1249
assert( in->mlang != 0 );
1250
return Win32MLangGetChar((byte)c, in, &bytesRead);
1260
/* Output a Byte Order Mark if required */
1261
void outBOM( StreamOut *out )
1263
if ( out->encoding == UTF8
1264
#if SUPPORT_UTF16_ENCODINGS
1265
|| out->encoding == UTF16LE
1266
|| out->encoding == UTF16BE
1267
|| out->encoding == UTF16
1271
/* this will take care of encoding the BOM correctly */
1272
WriteChar( UNICODE_BOM, out );
1276
/* this is an intermediate fix for various problems; in the */
1277
/* long term code and data in charsets.c should be used */
1278
static struct _enc2iana
1282
ctmbstr tidyOptName;
1283
} const enc2iana[] =
1285
{ ASCII, "us-ascii", "ascii" },
1286
{ LATIN0, "iso-8859-15", "latin0" },
1287
{ LATIN1, "iso-8859-1", "latin1" },
1288
{ UTF8, "utf-8", "utf8" },
1289
{ MACROMAN, "macintosh", "mac" },
1290
{ WIN1252, "windows-1252", "win1252" },
1291
{ IBM858, "ibm00858", "ibm858" },
1292
#if SUPPORT_UTF16_ENCODINGS
1293
{ UTF16LE, "utf-16", "utf16le" },
1294
{ UTF16BE, "utf-16", "utf16be" },
1295
{ UTF16, "utf-16", "utf16" },
1297
#if SUPPORT_ASIAN_ENCODINGS
1298
{ BIG5, "big5", "big5" },
1299
{ SHIFTJIS, "shift_jis", "shiftjis"},
1301
#ifndef NO_NATIVE_ISO2022_SUPPORT
1302
{ ISO2022, NULL, "iso2022" },
1304
{ RAW, NULL, "raw" }
1307
ctmbstr GetEncodingNameFromTidyId(uint id)
1311
for (i = 0; enc2iana[i].name; ++i)
1312
if (enc2iana[i].id == id)
1313
return enc2iana[i].name;
1318
ctmbstr GetEncodingOptNameFromTidyId(uint id)
1322
for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1323
if (enc2iana[i].id == id)
1324
return enc2iana[i].tidyOptName;
1329
int GetCharEncodingFromOptName( ctmbstr charenc )
1333
for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1334
if (tmbstrcasecmp(charenc, enc2iana[i].tidyOptName) == 0 )
1335
return enc2iana[i].id;