1
1
/* streamio.c -- handles character stream I/O
3
(c) 1998-2005 (W3C) MIT, ERCIM, Keio University
3
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4
4
See tidy.h for the copyright notice.
9
$Date: 2005/07/22 15:54:58 $
9
$Date: 2007/07/22 09:33:26 $
12
12
Wrapper around Tidy input source and output sink
13
13
that calls appropriate interfaces, and applies
46
46
static void EncodeIbm858( uint c, StreamOut* out );
47
47
static void EncodeLatin0( uint c, StreamOut* out );
49
static uint DecodeIbm850(uint c);
50
static uint DecodeLatin0(uint c);
52
static uint PopChar( StreamIn *in );
49
54
/******************************
50
55
** Static (duration) Globals
51
56
******************************/
70
75
#ifdef TIDY_WIN32_MLANG_SUPPORT
74
{ 0, filesink_putByte }
79
{ 0, TY_(filesink_putByte) }
77
StreamOut* StdErrOutput(void)
82
StreamOut* TY_(StdErrOutput)(void)
79
84
if ( stderrStreamOut.sink.sinkData == 0 )
80
stderrStreamOut.sink.sinkData = (ulong) stderr;
85
stderrStreamOut.sink.sinkData = stderr;
81
86
return &stderrStreamOut;
84
StreamOut* StdOutOutput(void)
90
StreamOut* TY_(StdOutOutput)(void)
86
92
if ( stdoutStreamOut.sink.sinkData == 0 )
87
stdoutStreamOut.sink.sinkData = (ulong) stdout;
93
stdoutStreamOut.sink.sinkData = stdout;
88
94
return &stdoutStreamOut;
91
void ReleaseStreamOut( StreamOut* out )
98
void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out )
93
100
if ( out && out != &stderrStreamOut && out != &stdoutStreamOut )
95
102
if ( out->iotype == FileIO )
96
103
fclose( (FILE*) out->sink.sinkData );
104
TidyDocFree( doc, out );
102
108
/************************
104
110
************************/
106
static StreamIn* initStreamIn( TidyDocImpl* doc, int encoding )
112
static void InitLastPos( StreamIn *in );
114
StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding )
108
StreamIn *in = (StreamIn*) MemAlloc( sizeof(StreamIn) );
116
StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) );
110
ClearMemory( in, sizeof(StreamIn) );
118
TidyClearMemory( in, sizeof(StreamIn) );
113
121
in->encoding = encoding;
114
122
in->state = FSM_ASCII;
116
124
in->bufsize = CHARBUF_SIZE;
117
in->charbuf = (tchar*)MemAlloc(sizeof(tchar) * in->bufsize);
125
in->allocator = doc->allocator;
126
in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize);
118
128
#ifdef TIDY_STORE_ORIGINAL_TEXT
119
129
in->otextbuf = NULL;
120
130
in->otextlen = 0;
126
void freeStreamIn(StreamIn* in)
136
void TY_(freeStreamIn)(StreamIn* in)
128
138
#ifdef TIDY_STORE_ORIGINAL_TEXT
129
139
if (in->otextbuf)
130
MemFree(in->otextbuf);
140
TidyFree(in->allocator, in->otextbuf);
132
MemFree(in->charbuf);
142
TidyFree(in->allocator, in->charbuf);
143
TidyFree(in->allocator, in);
136
StreamIn* FileInput( TidyDocImpl* doc, FILE *fp, int encoding )
146
StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding )
138
StreamIn *in = initStreamIn( doc, encoding );
139
initFileSource( &in->source, fp );
148
StreamIn *in = TY_(initStreamIn)( doc, encoding );
149
if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 )
151
TY_(freeStreamIn)( in );
140
154
in->iotype = FileIO;
144
StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
158
StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding )
146
StreamIn *in = initStreamIn( doc, encoding );
147
initInputBuffer( &in->source, buf );
160
StreamIn *in = TY_(initStreamIn)( doc, encoding );
161
tidyInitInputBuffer( &in->source, buf );
148
162
in->iotype = BufferIO;
152
StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding )
166
StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding )
154
StreamIn *in = initStreamIn( doc, encoding );
168
StreamIn *in = TY_(initStreamIn)( doc, encoding );
155
169
memcpy( &in->source, source, sizeof(TidyInputSource) );
156
170
in->iotype = UserIO;
160
int ReadBOMEncoding(StreamIn *in)
174
int TY_(ReadBOMEncoding)(StreamIn *in)
163
177
#if SUPPORT_UTF16_ENCODINGS
194
208
/* little-endian UTF-16 */
195
209
if (in->encoding != UTF16 && in->encoding != UTF16LE)
196
ReportEncodingWarning(in->doc, ENCODING_MISMATCH, UTF16LE);
210
TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE);
198
212
return UTF16LE; /* return decoded BOM */
230
244
#ifdef TIDY_STORE_ORIGINAL_TEXT
231
void AddByteToOriginalText(StreamIn *in, tmbchar c)
245
void TY_(AddByteToOriginalText)(StreamIn *in, tmbchar c)
233
247
if (in->otextlen + 1 >= in->otextsize)
235
249
size_t size = in->otextsize ? 1 : 2;
236
in->otextbuf = MemRealloc(in->otextbuf, in->otextsize + size);
250
in->otextbuf = TidyRealloc(in->allocator, in->otextbuf, in->otextsize + size);
237
251
in->otextsize += size;
239
253
in->otextbuf[in->otextlen++] = c;
240
254
in->otextbuf[in->otextlen ] = 0;
243
void AddCharToOriginalText(StreamIn *in, tchar c)
257
void TY_(AddCharToOriginalText)(StreamIn *in, tchar c)
245
259
int i, err, count = 0;
246
260
tmbchar buf[10] = {0};
248
err = EncodeCharToUTF8Bytes(c, buf, NULL, &count);
262
err = TY_(EncodeCharToUTF8Bytes)(c, buf, NULL, &count);
259
273
for (i = 0; i < count; ++i)
260
AddByteToOriginalText(in, buf[i]);
274
TY_(AddByteToOriginalText)(in, buf[i]);
265
uint ReadChar( StreamIn *in )
278
static void InitLastPos( StreamIn *in )
281
in->firstlastpos = 0;
284
static void PopLastPos( StreamIn *in )
286
in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE;
287
if ( in->curlastpos == in->firstlastpos )
288
in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE;
291
static void SaveLastPos( StreamIn *in )
294
in->lastcols[in->curlastpos] = in->curcol;
297
static void RestoreLastPos( StreamIn *in )
299
if ( in->firstlastpos == in->curlastpos )
303
in->curcol = in->lastcols[in->curlastpos];
304
if ( in->curlastpos == 0 )
305
in->curlastpos = LASTPOS_SIZE;
310
uint TY_(ReadChar)( StreamIn *in )
267
312
uint c = EndOfStream;
268
313
uint tabsize = cfg( in->doc, TidyTabSize );
318
365
#ifdef TIDY_STORE_ORIGINAL_TEXT
320
AddCharToOriginalText(in, (tchar)c);
367
TY_(AddCharToOriginalText)(in, (tchar)c);
322
369
c = ReadCharFromStream(in);
372
TY_(UngetChar)( c, in );
330
377
#ifdef TIDY_STORE_ORIGINAL_TEXT
331
AddCharToOriginalText(in, (tchar)c);
378
TY_(AddCharToOriginalText)(in, (tchar)c);
375
422
in->encoding == UTF16 ||
376
423
in->encoding == UTF16BE )
378
if ( !IsValidUTF16FromUCS4(c) )
425
if ( !TY_(IsValidUTF16FromUCS4)(c) )
380
427
/* invalid UTF-16 value */
381
ReportEncodingError(in->doc, INVALID_UTF16, c, yes);
428
TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes);
384
else if ( IsLowSurrogate(c) )
431
else if ( TY_(IsLowSurrogate)(c) )
387
434
uint m = ReadCharFromStream( in );
389
436
return EndOfStream;
392
if ( IsHighSurrogate(m) )
439
if ( TY_(IsHighSurrogate)(m) )
394
n = CombineSurrogatePair( m, n );
395
if ( IsValidCombinedChar(n) )
441
n = TY_(CombineSurrogatePair)( m, n );
442
if ( TY_(IsValidCombinedChar)(n) )
398
445
/* not a valid pair */
400
ReportEncodingError( in->doc, INVALID_UTF16, c, yes );
447
TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes );
424
471
Bool isVendorChar = ( in->encoding == WIN1252 ||
425
472
in->encoding == MACROMAN );
426
473
Bool isWinChar = ( in->encoding == WIN1252 ||
427
ReplacementCharEncoding == WIN1252 );
474
TY_(ReplacementCharEncoding) == WIN1252 );
428
475
Bool isMacChar = ( in->encoding == MACROMAN ||
429
ReplacementCharEncoding == MACROMAN );
476
TY_(ReplacementCharEncoding) == MACROMAN );
431
478
/* set error position just before offending character */
432
in->doc->lexer->lines = in->curline;
433
in->doc->lexer->columns = in->curcol;
481
in->doc->lexer->lines = in->curline;
482
in->doc->lexer->columns = in->curcol;
436
c1 = DecodeWin1252( c );
486
c1 = TY_(DecodeWin1252)( c );
437
487
else if ( isMacChar )
438
c1 = DecodeMacRoman( c );
488
c1 = TY_(DecodeMacRoman)( c );
440
490
replMode = REPLACED_CHAR;
442
492
if ( c1 == 0 && isVendorChar )
443
ReportEncodingError(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
493
TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR);
444
494
else if ( ! isVendorChar )
445
ReportEncodingError(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
495
TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR);
494
546
in->pushed = yes;
496
548
if (in->bufpos + 1 >= in->bufsize)
497
in->charbuf = (tchar*)MemRealloc(in->charbuf, sizeof(tchar) * ++(in->bufsize));
549
in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize));
499
551
in->charbuf[(in->bufpos)++] = c;
504
in->curcol = in->lastcol;
556
RestoreLastPos( in );
511
563
************************/
513
static StreamOut* initStreamOut( int encoding, uint nl )
565
static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl )
515
StreamOut* out = (StreamOut*) MemAlloc( sizeof(StreamOut) );
516
ClearMemory( out, sizeof(StreamOut) );
567
StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) );
568
TidyClearMemory( out, sizeof(StreamOut) );
517
569
out->encoding = encoding;
518
570
out->state = FSM_ASCII;
523
StreamOut* FileOutput( FILE* fp, int encoding, uint nl )
575
StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl )
525
StreamOut* out = initStreamOut( encoding, nl );
526
initFileSink( &out->sink, fp );
577
StreamOut* out = initStreamOut( doc, encoding, nl );
578
TY_(initFileSink)( &out->sink, fp );
527
579
out->iotype = FileIO;
530
StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint nl )
582
StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl )
532
StreamOut* out = initStreamOut( encoding, nl );
533
initOutputBuffer( &out->sink, buf );
584
StreamOut* out = initStreamOut( doc, encoding, nl );
585
tidyInitOutputBuffer( &out->sink, buf );
534
586
out->iotype = BufferIO;
537
StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint nl )
589
StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl )
539
StreamOut* out = initStreamOut( encoding, nl );
591
StreamOut* out = initStreamOut( doc, encoding, nl );
540
592
memcpy( &out->sink, sink, sizeof(TidyOutputSink) );
541
593
out->iotype = UserIO;
545
void WriteChar( uint c, StreamOut* out )
597
void TY_(WriteChar)( uint c, StreamOut* out )
547
599
/* Translate outgoing newlines */
550
602
if ( out->nl == TidyCRLF )
551
WriteChar( CR, out );
603
TY_(WriteChar)( CR, out );
552
604
else if ( out->nl == TidyCR )
577
EncodeCharToUTF8Bytes( c, NULL, &out->sink, &count );
629
TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count );
580
/* ReportEncodingError(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
632
/* TY_(ReportEncodingError)(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */
581
633
/* replacement char 0xFFFD encoded as UTF-8 */
582
634
/* U+FFFD is EF BF BD in UTF-8; a third byte of 0xBF would encode U+FFFF instead */
PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBD, out);
633
685
int i, numChars = 1;
634
686
uint theChars[2];
636
if ( !IsValidUTF16FromUCS4(c) )
688
if ( !TY_(IsValidUTF16FromUCS4)(c) )
638
690
/* invalid UTF-16 value */
639
/* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
691
/* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
643
else if ( IsCombinedChar(c) )
695
else if ( TY_(IsCombinedChar)(c) )
645
697
/* output both, unless something goes wrong */
647
if ( !SplitSurrogatePair(c, &theChars[0], &theChars[1]) )
699
if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) )
649
/* ReportEncodingError(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
701
/* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */
1195
1249
int err, count = 0;
1197
1251
/* first byte "c" is passed in separately */
1198
err = DecodeUTF8BytesToChar( &n, c, NULL, &in->source, &count );
1252
err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count );
1199
1253
if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */
1200
1254
return EndOfStream;
1204
1258
in->doc->lexer->lines = in->curline;
1205
1259
in->doc->lexer->columns = in->curcol;
1207
ReportEncodingError(in->doc, INVALID_UTF8, n, no);
1261
TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no);
1208
1262
n = 0xFFFD; /* replacement char */
1246
1300
#ifdef TIDY_WIN32_MLANG_SUPPORT
1247
1301
else if (in->encoding > WIN32MLANG)
1249
assert( in->mlang != 0 );
1250
return Win32MLangGetChar((byte)c, in, &bytesRead);
1303
assert( in->mlang != NULL );
1304
return TY_(Win32MLangGetChar)((byte)c, in, &bytesRead);
1329
int GetCharEncodingFromOptName( ctmbstr charenc )
1383
int TY_(GetCharEncodingFromOptName)( ctmbstr charenc )
1333
1387
for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i)
1334
if (tmbstrcasecmp(charenc, enc2iana[i].tidyOptName) == 0 )
1388
if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 )
1335
1389
return enc2iana[i].id;
1397
* indent-tabs-mode: nil
1399
* eval: (c-set-offset 'substatement-open 0)