3
"$Header: d:/cvsroot/tads/tads3/TCSRC.CPP,v 1.3 1999/07/11 00:46:55 MJRoberts Exp $";
7
* Copyright (c) 1999, 2002 Michael J. Roberts. All Rights Reserved.
9
* Please see the accompanying license file, LICENSE.TXT, for information
10
* on using and copying this software.
14
tcsrc.cpp - source file reader
20
04/13/99 MJRoberts - Creation
33
/* ------------------------------------------------------------------------ */
37
CTcSrcFile::~CTcSrcFile()
39
/* close my source file */
43
/* release my character mapper */
45
mapper_->release_ref();
50
// we don't currently need this, but keep the source in case it
51
// becomes interesting later
53
/* ------------------------------------------------------------------------ */
55
* Open a plain ASCII file, with no #charset marker.  If the file starts
* with a UTF-8 or UCS-2 byte-order marker, that encoding is used instead.
57
CTcSrcFile *CTcSrcFile::open_plain(const char *filename)
64
* open the file in binary mode, since we do all of the newline
65
* interpretation explicitly
67
if ((fp = osfoprb(filename, OSFTTEXT)) == 0)
70
/* read the first few bytes of the file */
71
siz = osfrbc(fp, buf, sizeof(buf));
73
/* check for a 3-byte UTF-8 marker */
75
&& (uchar)buf[0] == 0xEF
76
&& (uchar)buf[1] == 0xBB
77
&& (uchar)buf[2] == 0xBF)
80
* seek to the byte after the marker, so that our caller won't see
83
osfseek(fp, 3, OSFSK_SET);
85
/* return a source file reader with a utf-8 mapper */
86
return new CTcSrcFile(fp, new CCharmapToUniUTF8());
89
/* if we read at least two bytes, try auto-detecting UCS-2 */
92
/* if the first bytes are 0xFF 0xFE, it's UCS-2 low-byte first */
93
if ((unsigned char)buf[0] == 0xFF && (unsigned char)buf[1] == 0xFE)
95
/* seek to the byte after the marker */
96
osfseek(fp, 2, OSFSK_SET);
98
/* return a reader with a little-endian mapper */
99
return new CTcSrcFile(fp, new CCharmapToUniUcs2Little());
102
/* if the first bytes are 0xFE 0xFF, it's UCS-2 high-byte first */
103
if ((unsigned char)buf[0] == 0xFE && (unsigned char)buf[1] == 0xFF)
105
/* seek to the byte after the marker */
106
osfseek(fp, 2, OSFSK_SET);
108
/* return a reader with a big-endian mapper */
109
return new CTcSrcFile(fp, new CCharmapToUniUcs2Big());
114
* there are no Unicode markers, so our only remaining option is plain
115
* ASCII - return a source file object with a plain ASCII mapper
117
return new CTcSrcFile(fp, new CCharmapToUniASCII());
121
/* ------------------------------------------------------------------------ */
123
* Open a plain ASCII source file.
125
CTcSrcFile *CTcSrcFile::open_ascii(const char *filename)
130
* open the file in binary mode, since we do all of the newline
131
* interpretation explicitly
133
if ((fp = osfoprb(filename, OSFTTEXT)) == 0)
136
/* return a source reader with a plain ASCII mapper */
137
return new CTcSrcFile(fp, new CCharmapToUniASCII());
141
/* ------------------------------------------------------------------------ */
145
CTcSrcFile *CTcSrcFile::open_source(const char *filename,
146
class CResLoader *res_loader,
147
const char *default_charset,
149
int *default_charset_error)
155
CCharmapToUni *mapper;
157
/* presume we won't find an invalid #charset directive */
158
*charset_error = FALSE;
160
/* presume we'll have no problem with the default character set */
161
*default_charset_error = FALSE;
164
* open the file in binary mode, so that we can scan the first few
165
* bytes to see if we can detect the character set from information
166
* at the beginning of the file
168
fp = osfoprb(filename, OSFTTEXT);
170
/* if we couldn't open the file, return failure */
174
/* note the starting offset in the file */
175
startofs = osfpos(fp);
177
/* read the first few bytes of the file */
178
siz = osfrbc(fp, buf, sizeof(buf));
180
/* check for a 3-byte UTF-8 byte-order marker */
181
if (siz >= 3 && (uchar)buf[0] == 0xEF && (uchar)buf[1] == 0xBB
182
&& (uchar)buf[2] == 0xBF)
188
/* skip at least the three-byte marker sequence */
192
* check for a #charset marker for utf-8 - this would be redundant,
197
if (rem > 9 && memcmp(p, "#charset ", 9) == 0)
200
for (p += 9, rem -= 9 ; rem != 0 && (*p == ' ' || *p == '\t') ;
203
/* check for valid character set markers */
204
if (rem >= 7 && memicmp(p, "\"utf-8\"", 7) == 0)
206
/* skip the whole sequence */
207
skip = (p + 7) - buf;
209
else if (rem >= 6 && memicmp(p, "\"utf8\"", 6) == 0)
211
/* skip the whole sequence */
212
skip = (p + 6) - buf;
216
/* seek past the character set markers */
217
osfseek(fp, startofs + skip, OSFSK_SET);
219
/* return a new utf-8 decoder */
220
return new CTcSrcFile(fp, new CCharmapToUniUTF8());
223
/* if we read at least two bytes, try auto-detecting unicode */
227
const char *const *cs_names;
230
/* presume we won't find a byte-order marker */
233
/* if the first bytes are 0xFF 0xFE, it's UCS-2 low-byte first */
234
if ((unsigned char)buf[0] == 0xFF && (unsigned char)buf[1] == 0xFE)
236
static const char *names[] = { "unicodel", "utf-16le", 0 };
238
/* create a UCS-2 little-endian reader */
239
srcf = new CTcSrcFile(fp, new CCharmapToUniUcs2Little());
244
/* if the first bytes are 0xFE 0xFF, it's UCS-2 high-byte first */
245
if ((unsigned char)buf[0] == 0xFE && (unsigned char)buf[1] == 0xFF)
247
static const char *names[] = { "unicodeb", "utf-16be", 0 };
249
/* create a UCS-2 big-endian reader */
250
srcf = new CTcSrcFile(fp, new CCharmapToUniUcs2Big());
255
/* if we found the byte-order marker, we know the character set */
260
/* we at least want to skip the byte-order marker */
263
/* check to see if we have a '#charset' directive */
264
if (ucs_str_starts_with(buf + 2, siz - 2, "#charset ",
270
/* scan past following spaces */
271
for (p = buf + 2 + 18, rem = siz - 2 - 18 ;
272
rem >= 2 && (ucs_char_eq(p, ' ', bige, FALSE)
273
|| ucs_char_eq(p, '\t', bige, FALSE)) ;
276
/* check for a '"' */
277
if (rem >= 2 && ucs_char_eq(p, '"', bige, FALSE))
279
const char *const *n;
286
* check for a match to any of the valid names for this
289
for (n = cs_names ; *n != 0 ; ++n)
291
/* if it's a match, stop scanning */
292
if (ucs_str_starts_with(p, rem, *n, bige, TRUE))
296
/* get the length of the name */
299
/* check for a close quote */
301
&& ucs_char_eq(p + l, '"', bige, FALSE))
303
/* skip the name and the quote */
307
/* skip the source text to this point */
318
/* seek just past the character set indicators */
319
osfseek(fp, startofs + skip, OSFSK_SET);
321
/* return the file */
327
* It doesn't appear to use UCS-2 encoding (at least, the file
328
* doesn't start with a byte-order sensing sequence). Check to see
329
* if the file starts with "#charset " in ASCII single-byte
332
if (siz >= 9 && memcmp(buf, "#charset ", 9) == 0)
337
/* skip the #charset string and any following spaces */
338
for (p = buf + 9, rem = siz - 9 ;
339
rem > 0 && (*p == ' ' || *p == '\t') ; ++p, --rem) ;
341
/* make sure we're looking at a '"' */
342
if (rem != 0 && *p == '"')
346
/* skip the open quote */
350
/* remember where the character set name starts */
354
* find the closing quote, which must occur before a CR or
357
for ( ; rem > 0 && *p != '"' && *p != 10 && *p != 13 ;
360
/* make sure we found a matching quote */
361
if (rem != 0 && *p == '"')
363
/* seek just past the #charset string */
364
osfseek(fp, startofs + (p - buf) + 1, OSFSK_SET);
367
* put a null terminator at the end of the character set
372
/* create a mapper */
373
mapper = CCharmapToUni::load(res_loader, charset_name);
376
* if that succeeded, return a reader for the mapper;
377
* otherwise, simply proceed as though no #charset had
378
* been present, so that we create a default mapper
382
/* success - return a reader */
383
return new CTcSrcFile(fp, mapper);
387
/* tell the caller the #charset was invalid */
388
*charset_error = TRUE;
395
* we didn't find any sensing codes, so seek back to the start of
398
osfseek(fp, startofs, OSFSK_SET);
401
* We couldn't identify the file's character set based on anything
402
* in the file, so create a mapper for the given default character
403
* set. If there's not even a default character set defined, create
404
* a plain ASCII mapper.
406
if (default_charset != 0)
407
mapper = CCharmapToUni::load(res_loader, default_charset);
409
mapper = new CCharmapToUniASCII();
411
/* check to see if we created a mapper */
414
/* return a source file reader based on the mapper */
415
return new CTcSrcFile(fp, mapper);
420
* we failed to create a mapper for the default character set -
423
*default_charset_error = TRUE;
425
/* close the input file */
433
/* ------------------------------------------------------------------------ */
435
* Read a line of text from the file.
437
size_t CTcSrcFile::read_line(char *buf, size_t bufl)
441
/* start out writing to the start of the caller's buffer */
445
* Keep going until we run out of input file, fill up the buffer, or
446
* reach the end of a line
452
/* read some more data if our buffer is empty */
455
/* load another buffer-full */
456
rem_ = mapper_->read_file(fp_, buf_, sizeof(buf_), 0);
459
* If we didn't read anything, we've reached the end of the
460
* file. If we've already copied anything into the caller's
461
* buffer, null-terminate their buffer and return success;
462
* otherwise, return failure, since the caller has already
463
* read everything available from the file.
468
* Remember that we've reached the end of the file.
469
* We're about to return the last of the data, so the
470
* caller will not need to call us again (although it's
471
* legal if they do - we'll just return a zero length on
476
/* check if we've copied anything to the caller's buffer */
479
/* the caller's buffer is empty - return end of file */
484
/* null-terminate the caller's buffer */
488
* return the number of bytes copied, including the null
495
/* start over at the beginning of the buffer */
500
* Scan the input buffer one character (not byte) at a time.
501
* Keep track of how many bytes we've skipped. Stop when
502
* we reach a CR or LF character, or when skipping another
503
* character would exceed the remaining capacity of the caller's
504
* buffer, or when we run out of data in our input buffer.
506
for (src = p_ ; rem_ > 0 ; )
510
/* get the length of the current character */
511
csiz = utf8_ptr::s_charsize(*src);
514
* if this character plus a null terminator wouldn't fit in
515
* the output buffer, stop scanning
520
* There's no more room in the caller's buffer. Copy
521
* what we've scanned so far to the output buffer and
522
* null-terminate the buffer.
524
memcpy(dst, p_, src - p_);
526
/* advance past the copied bytes and write the null byte */
530
/* advance the buffer read pointer over the copied bytes */
533
/* return success - indicate the number of bytes copied */
538
* If it's a newline character of some kind, we're done with
539
* this line. Note that we can just check the byte directly,
540
* since if it's a multi-byte character, we'll never mistake
541
* the first byte for a single-byte newline or carriage return
542
* character, since a UTF-8 lead byte always has the high bit
545
* Also treat the Unicode character 0x2028 (line separator) as
548
if (*src == '\n' || *src == '\r'
549
|| utf8_ptr::s_getch(src) == 0x2028)
553
/* copy what we've scanned so far to the caller's buffer */
554
memcpy(dst, p_, src - p_);
556
/* advance past the copied bytes */
560
* add a newline to the caller's buffer -- always add a
561
* '\n' newline, regardless of what kind of newline
562
* sequence we found in the input; also add a null
568
/* remember which type of newline we found */
571
/* advance past the newline */
576
* If the input buffer is empty, read more, so that we
577
* can check the next character after the newline
583
rem_ = mapper_->read_file(fp_, buf_, sizeof(buf_), 0);
585
/* start over at the start of the buffer */
590
* Check for a paired newline character. If we found a
591
* CR, check for an LF; if we found an LF, check for a
592
* CR. This will ensure that we will recognize
593
* essentially any newline character sequence for any
594
* platform - this will accept CR, LF, CR-LF, or LF-CR
598
&& ((nl == '\n' && *p_ == '\r')
599
|| (nl == '\r' && *p_ == '\n')))
601
/* it's a paired newline - skip the second character */
606
/* we've finished this line - return success */
610
/* skip this character in the input and proceed */
614
/* consider this character consumed in the caller's buffer */
619
* We've exhausted the current input buffer, without filling the
620
* caller's buffer. Copy what we've skipped so far into the
623
memcpy(dst, p_, src - p_);
626
* Advance the output pointer past the data we just copied, then
627
* continue looping to read more data from the input file.
633
/* ------------------------------------------------------------------------ */
635
* Buffer reader source object
641
CTcSrcMemory::CTcSrcMemory(const char *buf, CCharmapToUni *mapper)
647
/* get the length of the null-terminated source string */
651
* Allocate a buffer for a UTF8-encoded copy of the buffer -
652
* allocate three bytes per byte of the original, since this is the
653
* worst case for expansion of the encoding. Allocate one extra
654
* byte to ensure we have space for a null terminator.
657
buf_alo_ = (char *)t3malloc(alo_len + 1);
661
mapper->map(&p, &alo_len, buf, len);
663
/* null-terminate the translated buffer */
666
/* start reading at the start of the translated buffer */
673
CTcSrcMemory::~CTcSrcMemory()
675
/* free our buffer */
682
size_t CTcSrcMemory::read_line(char *buf, size_t bufl)
687
/* if there's nothing left in our buffer, return EOF */
691
/* start out writing to the start of the caller's buffer */
695
* Scan the input buffer one character (not byte) at a time. Keep
696
* track of how many bytes we've skipped. Stop when we reach a
697
* CR or LF character, or when skipping another character would
698
* exceed the remaining capacity of the caller's buffer, or when we
699
* run out of data in our input buffer.
701
for (src = buf_ ; *src != '\0' ; )
705
/* get the length of the current character */
706
csiz = utf8_ptr::s_charsize(*src);
709
* if this character plus a null terminator wouldn't fit in the
710
* output buffer, stop scanning
715
* There's no more room in the caller's buffer. Copy what
716
* we've scanned so far to the output buffer and
717
* null-terminate the buffer.
719
memcpy(dst, buf_, src - buf_);
721
/* advance past the copied bytes and write the null byte */
725
/* advance the buffer read pointer over the copied bytes */
728
/* return success - indicate the number of bytes copied */
733
* If it's a newline character of some kind, we're done with this
734
* line. Note that we can just check the byte directly, since if
735
* it's a multi-byte character, we'll never mistake the first byte
736
* for a single-byte newline or carriage return character, since a
737
* UTF-8 lead byte always has the high bit set. Allow Unicode
738
* character 0x2028 (line separator) as a newline as well.
740
if (*src == '\n' || *src == '\r' || utf8_ptr::s_getch(src) == 0x2028)
744
/* copy what we've scanned so far to the caller's buffer */
745
memcpy(dst, buf_, src - buf_);
747
/* advance past the copied bytes */
751
* add a newline to the caller's buffer -- always add a '\n'
752
* newline, regardless of what kind of newline sequence we
753
* found in the input; also add a null terminator
758
/* remember which type of newline we found */
761
/* advance past the newline */
765
* Check for a paired newline character. If we found a CR,
766
* check for an LF; if we found an LF, check for a CR. This
767
* will ensure that we will recognize essentially any
768
* newline character sequence for any platform - this will
769
* accept CR, LF, CR-LF, or LF-CR sequences.
771
if ((nl == '\n' && *buf_ == '\r')
772
|| (nl == '\r' && *buf_ == '\n'))
774
/* it's a paired newline - skip the second character */
778
/* we've finished this line - return its length */
782
/* skip this character in the input and proceed */
785
/* consider this space consumed in the caller's buffer */
790
* We've exhausted the input buffer, without filling the caller's
791
* buffer. Copy what we've skipped so far into the caller's buffer.
793
memcpy(dst, buf_, src - buf_);
796
/* null-terminate the result buffer */
799
/* advance our input pointer to the new (EOF) position */
802
/* return the buffer length */