1
/* $Header: d:/cvsroot/tads/tads3/charmap.h,v 1.2 1999/05/17 02:52:29 MJRoberts Exp $ */
4
* Copyright (c) 1998, 2002 Michael J. Roberts. All Rights Reserved.
6
* Please see the accompanying license file, LICENSE.TXT, for information
7
* on using and copying this software.
11
charmap.h - character-set mapper
13
Provides mappings between 16-bit Unicode and single-byte, multi-byte,
14
and double-byte character sets.
18
10/17/98 MJRoberts - Creation
33
/* ------------------------------------------------------------------------ */
35
* Mapping Types. This enum provides a characterization of a local
36
* character set (as defined in a mapping file).
41
* Single-byte character set - each character is represented with a
47
* Double-byte character set - each character is represented with
48
* exactly two 8-bit bytes. In each byte pair, the first byte is
49
* taken as the high-order byte, so a text input stream consisting
50
* of the bytes 0x12, 0x34, 0x56, 0x78 would be interpreted as the
51
* two 16-bit code point values 0x1234, 0x5678.
56
* Mixed multi-byte - each character is represented by either one or
57
* two 8-bit bytes. Each two-byte character starts with a byte that
58
* is only used in two-byte characters; each one-byte character
59
* consists of a single byte that is not used as the first byte of
60
* any two-byte character. In each two-byte character, the first
61
* byte is taken as the high-order byte.
63
* For example, assuming that 0x00-0x7F are defined as single-byte
64
* characters, and 0x8000-0xFFFF are defined as double-byte
65
* characters, the byte sequence 0x12, 0x81, 0xAB, 0x82, 0xCD, 0x34
66
* would be taken as the character sequence 0x12, 0x81AB, 0x82CD,
72
/* ------------------------------------------------------------------------ */
74
* Basic character mapper class.
80
/* count one more reference to this object */
void add_ref()
{
    ref_cnt_ += 1;
}
82
/* release a reference; delete on removing the last reference */
85
/* count the unreference */
88
/* if that leaves no references, delete me */
96
/* start out with one reference, for the initial creator */
100
/* virtual destructor with an empty body - nothing is released here */
virtual ~CCharmap() { }
103
* Open and characterize a mapping file. Returns the osfildef
104
* pointer if the file was successfully opened and parsed, or null
105
* if not. Sets *map_type to indicate the type of mapping contained
108
static osfildef *open_map_file(class CResLoader *res_loader,
109
const char *table_name,
110
charmap_type_t *map_type);
112
/* check a name to see if it matches one of the names for ASCII */
113
static int name_is_ascii_synonym(const char *table_name)
115
/* accept any of the various synonyms for ASCII */
116
return (stricmp(table_name, "us-ascii") == 0
117
|| stricmp(table_name, "asc7dflt") == 0
118
|| stricmp(table_name, "ascii") == 0
119
|| stricmp(table_name, "iso646-us") == 0
120
|| stricmp(table_name, "iso-ir-6") == 0
121
|| stricmp(table_name, "cp367") == 0
122
|| stricmp(table_name, "us") == 0);
125
/* check a name to see if it matches one of the names for ISO 8859-1 */
126
static int name_is_8859_1_synonym(const char *table_name)
128
/* accept any of the various names for ISO 8859-1 */
129
return (stricmp(table_name, "iso-8859-1") == 0
130
|| stricmp(table_name, "iso_8859-1") == 0
131
|| stricmp(table_name, "iso-ir-100") == 0
132
|| stricmp(table_name, "latin1") == 0
133
|| stricmp(table_name, "l1") == 0
134
|| stricmp(table_name, "cp819") == 0);
137
/* reference count */
138
unsigned int ref_cnt_;
141
/* ------------------------------------------------------------------------ */
143
* Base character mapper class for mapping from a local character set to
144
* UTF-8. This is an abstract interface that must be implemented for
145
* different classes of character sets.
147
class CCharmapToUni: public CCharmap
154
* Create a mapping object for a given character table. We'll read
155
* enough of the character table to determine the appropriate
156
* concrete subclass to instantiate, then create an object, load the
157
* table into the object, and return the object. The caller is
158
* responsible for deleting the object when finished with it.
160
* Returns null if the mapping file cannot be loaded.
162
static CCharmapToUni *load(class CResLoader *res_loader,
163
const char *table_name);
166
* Determine if the given byte sequence forms a complete character in
167
* the local character set. Returns true if so, false if not. 'len'
168
* must be at least 1.
170
virtual int is_complete_char(const char *p, size_t len) const = 0;
173
* Convert a string from the local character set to Unicode.
174
* Returns the byte length of the output. If the output buffer is
175
* too small to store the result, we will return the size of the
176
* full result, but we won't write past the end of the buffer.
178
* We'll advance *output_ptr by the number of bytes we write.
180
* If we store anything, we'll decrement *output_buf_len by the
181
* number of bytes we store; if we don't have enough room, we'll set
182
* *output_buf_len to zero.
184
* input_ptr is a pointer to the input string; input_len is the
185
* length in bytes of the input string.
187
virtual size_t map(char **output_ptr, size_t *output_buf_len,
188
const char *input_ptr, size_t input_len) const = 0;
191
* Convert a string from the local character set to Unicode.
193
* This works the same way as map(), but additionally provides
194
* information on the consumption of source bytes by filling in
195
* partial_len with the number of bytes at the end of the source
196
* buffer that are not mappable because they do not form complete
197
* characters in the source character set. Since we scan all input
198
* regardless of whether there's space to store the resulting output,
199
* this will reflect the same number of bytes no matter what the
200
* output buffer length.
202
virtual size_t map2(char **output_ptr, size_t *output_buf_len,
203
const char *input_ptr, size_t input_len,
204
size_t *partial_len) const = 0;
207
* Map a null-terminated string into a buffer; returns the number of
208
* bytes of the buffer actually needed to store the string. If the
209
* entire string couldn't be mapped, this will return a number
210
* greater than or equal to the output buffer size, but we will not
211
* write beyond the end of the buffer.
213
* If there's space, the result will be null-terminated; however,
214
* the null terminator byte will not be included in the result
215
* length. If the return value exactly equals outbuflen, it means
216
* that the string exactly fills the buffer, hence there isn't space
217
* for a null terminator.
219
size_t map_str(char *outbuf, size_t outbuflen, const char *input_str);
222
* Read characters from a file into a buffer, translating the
223
* characters to UTF-8. Returns the number of bytes copied into the
224
* buffer; returns zero on end of file. The buffer must be at least
225
* three bytes long to ensure that at least one character can be read
226
* from the file (the longest UTF-8 character takes up three bytes),
227
* since it would otherwise not be possible to distinguish reaching
228
* the end of the file from simply being unable to fit even one
229
* character into the buffer.
231
* The file can be opened in text or binary mode; we don't pay any
232
* attention to newline sequences, so the mode is not relevant to us.
234
* This routine may read fewer than the desired number of bytes. Upon
235
* return, the file's seek position should be set to the next byte of
236
* the file after the last character copied into the output buffer.
238
* 'read_limit' is the maximum number of bytes we're allowed to read
239
* from the underlying file. If this is zero, then the read size is
242
virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
243
unsigned long read_limit) = 0;
246
/* delete the mapping */
247
virtual ~CCharmapToUni() { /* empty body - nothing is released here */ }
249
/* load the mapping table from the file */
250
void load_table(osfildef *fp);
253
* Set a mapping. uni_code_pt is the unicode code point, and
254
* local_code_pt is the code point in the local character set.
256
virtual void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt) = 0;
259
/* ------------------------------------------------------------------------ */
261
* Base character mapper class for mapping from Unicode UTF-8 to a local
262
* character set. This is an abstract interface that must be separately
263
* implemented for different classes of character sets.
265
* Each mapping object maintains a table of mapping tables. The master
266
* table contains an array of up to 256 sub-tables. The top 8 bits of
267
* the unicode character value give the index in the master table. Each
268
* entry in the master table is a pointer to a sub-table, or a null
269
* pointer if there are no mappings for characters in the range for that
272
* For example, unicode characters 0x0000 through 0x007f are mapped
273
* through the table obtained by getting the pointer at index 0 from the
274
* master table. Unicode characters 0x0200 through 0x02ff are in the
275
* table at master table index 2.
277
* If a master table index entry is empty (i.e., the pointer in the
278
* master table at that index is null), it means that all of the
279
* characters in the range for that master index map to the default
280
* character. Otherwise, we index into the sub-table using the
281
* low-order 8 bits of the Unicode character code to find the character
282
* mapping giving the local character set code for the Unicode value.
284
* Each entry in the mapping table is the offset of the translation of
285
* the character within the translation array. The translation array is
286
* an array of bytes. The first byte of each entry is the length in
287
* bytes of the entry (not including the length byte), followed by the
288
* bytes of the entry.
290
* The first entry in the translation array is always the default
291
* character, which is the mapping we use for characters with no other
294
class CCharmapToLocal: public CCharmap
300
/* create a mapper and load the mapping from a file */
301
static CCharmapToLocal *load(class CResLoader *res_loader,
302
const char *table_name);
305
* Convert a character from Unicode to the local character set.
306
* Stores the character's byte or bytes at the given pointer, and
307
* increments the pointer to point to the next byte after the
310
* Returns the byte length of the output. If the output buffer is
311
* not long enough to store the result, we simply return the size of
312
* the result without storing anything.
314
* If we actually store anything, we'll decrement *output_buf_len by
315
* the number of bytes we stored; if we don't have room to store
316
* anything, we'll set *output_buf_len to zero.
318
virtual size_t map(wchar_t unicode_char, char **output_ptr,
319
size_t *output_buf_len) const = 0;
322
* Simple single-character mapper - returns the byte length of the
323
* local character equivalent of the unicode character, which is
324
* written into the buffer. If the buffer isn't big enough, we'll
325
* still return the length, but won't write anything to the buffer.
327
size_t map_char(wchar_t unicode_char, char *buf, size_t buflen)
329
/* map the character */
330
return map(unicode_char, &buf, &buflen);
335
* Convert a UTF-8 string with a given byte length to the local
338
* Returns the byte length of the result. If the result is too long
339
* to fit in the output buffer, we'll return the number of bytes we
340
* actually were able to store (we'll store as much as we can, and
341
* stop when we run out of space). We'll indicate in
342
* *src_bytes_used how many bytes of the source we were able to map.
344
* If the output buffer is null, we will store nothing, but simply
345
* determine how much space it would take to store the entire string.
347
* This base class provides an implementation of this method that is
348
* suitable for all subclasses, but the method is defined as virtual
349
* so that subclasses can override it with a more tailored (and thus
350
* more efficient) implementation. The general-purpose base-class
351
* implementation must call the virtual function map() for each
354
virtual size_t map_utf8(char *dest, size_t dest_len,
355
utf8_ptr src, size_t src_byte_len,
356
size_t *src_bytes_used) const;
359
* map from UTF-8 to the local character set - alternative interface using character buffers
360
* (rather than UTF8 pointers)
362
size_t map_utf8(char *dest, size_t dest_len,
363
const char *src, size_t src_byte_len,
364
size_t *src_bytes_used) const;
367
* Convert a null-terminated UTF-8 string to the local character set.
369
* Returns the byte length of the result. If the result is too long
370
* to fit in the output buffer, we'll return the size without storing
371
* the entire string (we'll store as much as we can, and stop when we
372
* run out of space, but continue counting the length needed; call
373
* with a destination buffer length of zero to simply determine how
374
* much space is needed for the result).
376
* The length returned does NOT include the null terminator. However,
377
* if there's room, we will null-terminate the result string. So, if
378
* the caller wants the result to be null terminated, it should make
379
* sure that the buffer contains one byte more than the space reported
380
* as necessary to store the result.
382
virtual size_t map_utf8z(char *dest, size_t dest_len, utf8_ptr src)
386
* Convert a null-terminated UTF-8 string to the local character set,
387
* filling in an 'escape' sequence for unknown characters. For each
388
* unknown character, we'll invoke the given callback to get the
389
* 'escaped' representation. Use &CCharmapToLocal::source_esc_cb, for
390
* example, to map using source-code-style escape sequences.
392
* The callback takes the unmappable character, a pointer to the output
393
* buffer, and a pointer to the length remaining. It should fill in
394
* the buffer with the escaped sequence (up to the remaining length
395
* limit), and adjust the buffer pointer and length for the space
396
* consumed. The return value is the full length required for the
397
* complete escape sequence, even if there's not enough space in the
398
* buffer to hold that many characters.
400
virtual size_t map_utf8z_esc(char *dest, size_t dest_len, utf8_ptr src,
401
size_t (*esc_fn)(wchar_t, char **, size_t *))
405
* ready-made callback for map_utf8z_esc() - map to unicode 'backslash'
406
* escape sequences ('\u1234'), as we'd use in tads source code
408
static size_t source_esc_cb(wchar_t ch, char **dest, size_t *len);
411
* Write data to a file, converting from UTF-8 to the local character
412
* set. Returns zero on success, non-zero if an error occurs writing
415
int write_file(osfildef *fp, const char *buf, size_t bufl);
418
* determine if the given Unicode character has a mapping to the local
421
virtual int is_mappable(wchar_t unicode_char) const
424
* By default, it's mappable if it has a non-default mapping in
425
* the translation table. The default mapping is always at offset
426
* zero in the translation table.
428
return (get_mapping(unicode_char) != 0);
432
* Get the display expansion for a unicode character. This returns a
433
* pointer to an array of wchar_t characters, and fills in the length
434
* variable. Returns null if there's no expansion.
436
* An "expansion" is a list of two or more unicode characters that
437
* should be substituted for the given unicode character when the
438
* character is displayed. Display expansions are normally used for
439
* visual approximations when the local character set doesn't contain
440
* an exact match for the unicode character; for example, an ASCII
441
* mapping might use the expansion "(c)" to represent the copyright
442
* circled-C symbol, or the two-character sequence "AE" to represent
445
const wchar_t *get_expansion(wchar_t unicode_char, size_t *len)
450
/* get the mapping offset in the expansion array */
451
ofs = get_exp_mapping(unicode_char);
453
/* if the mapping offset is zero, it means there's no mapping */
456
/* indicate that there's no mapping by returning null */
461
/* get the mapping pointer */
462
map = get_exp_ptr(ofs);
464
/* read the length and skip it */
465
*len = (size_t)*map++;
467
/* return the pointer to the first character of the expansion */
472
/* delete the mapping */
473
virtual ~CCharmapToLocal();
475
/* given a Unicode character, get the mapping for the character */
476
unsigned int get_mapping(wchar_t unicode_char) const
478
unsigned int *subtable;
480
/* get the mapping table */
481
subtable = get_sub_table(unicode_char);
484
* If there is no subtable, return the default character, which is
485
* always at offset zero in the translation array; otherwise, use
486
* the low-order 8 bits of the character code as the index into
487
* the subtable and return the value we find there
492
return subtable[unicode_char & 0xff];
495
/* given a Unicode character, get the expansion for the character */
496
unsigned int get_exp_mapping(wchar_t unicode_char) const
498
unsigned int *subtable;
500
/* get the mapping table */
501
subtable = get_exp_sub_table(unicode_char);
504
* if there's no subtable, return zero to indicate there's no
505
* expansion; otherwise, return the entry from the subtable
507
return (subtable == 0 ? 0 : subtable[unicode_char & 0xff]);
511
* Get a pointer to the sequence of bytes in the translation array at
514
const unsigned char *get_xlat_ptr(unsigned int ofs) const
516
return &xlat_array_[ofs];
520
* Get a pointer to the translation of a character and the length in
521
* bytes of the translation
523
const unsigned char *get_xlation(wchar_t unicode_char, size_t *map_len)
526
const unsigned char *map;
528
/* get the translation offset */
529
map = get_xlat_ptr(get_mapping(unicode_char));
531
/* read the length and skip it in the table */
532
*map_len = (size_t)*map++;
534
/* return the mapped byte sequence */
539
* get a pointer to the sequence of wchar_t values in the expansion
540
* array at a given offset
542
const wchar_t *get_exp_ptr(unsigned int ofs) const
544
return &exp_array_[ofs];
547
/* load the mapping table from a file */
548
void load_table(osfildef *fp);
551
* Given a Unicode character, get the sub-table for the character,
552
* or null if there is no sub-table for this character.
554
unsigned int *get_sub_table(wchar_t unicode_char) const
557
* use the high-order 8 bits of the unicode character as the
558
* index into the master table
560
return map_[(unicode_char >> 8) & 0xff];
564
* Given a Unicode character, get the expansion sub-table for the
565
* character, or null if there is no sub-table for the character.
567
unsigned int *get_exp_sub_table(wchar_t unicode_char) const
570
* use the high-order 8 bits of the unicode character as the index
571
* into the master table
573
return exp_map_[(unicode_char >> 8) & 0xff];
577
* Set a mapping. This allocates a new sub-table if necessary, and
578
* stores the local character mapping in the table.
580
void set_mapping(wchar_t unicode_char, unsigned int xlat_offset);
582
/* set an expansion mapping */
583
void set_exp_mapping(wchar_t unicode_char, unsigned int exp_offset);
586
* The master mapping table list. Each entry points to the
587
* sub-array that contains the mapping for the 256 characters whose
588
* high-order 8 bits give the index into this table. Each entry of
589
* the subarray is the offset within the xlat_array_ byte array of
590
* the first byte of the translation for the unicode character.
592
unsigned int *map_[256];
595
* The master expansion mapping list. This works just like map_, but
596
* points to exp_array_ entries for unicode display expansions.
598
unsigned int *exp_map_[256];
601
* The translation array. This is an array of bytes containing the
602
* translations. map_[high_8_bits][low_8_bits] contains the offset
603
* within this array of the translation of the character with the
604
* given code ((high_8_bits << 8) + low_8_bits). The first byte at
605
* this offset is the length in bytes of the translation, not
606
* counting the length byte. The remaining bytes are the bytes of
607
* the translation for the character.
609
unsigned char *xlat_array_;
611
/* size of the translation array */
612
size_t xlat_array_size_;
615
* The expansion array. This is an array of unicode characters
616
* containing the expansions for displaying unicode characters. This
617
* works just like xlat_array_: each entry in exp_map_ is an index into
618
* this array, which gives the starting point in the array of the run
619
* of entries for the expansion of that character. The first character
620
* of a run is a length prefix giving the number of characters in the
627
/* ======================================================================== */
629
* Local character set - to - Unicode UTF-8 mappers
632
/* ------------------------------------------------------------------------ */
634
* Trivial UTF8-to-UTF8 mapper - performs no conversions. This can be
635
* used when reading from an external data source that is itself in
636
* UTF-8 format; since this is identical to the format we use
637
* internally, no mapping is required.
639
class CCharmapToUniUTF8: public CCharmapToUni
642
/* read from a file */
643
virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
644
unsigned long read_limit);
646
/* determine if a byte sequence forms a complete character */
647
virtual int is_complete_char(const char *p, size_t len) const
650
* For UTF-8, we can infer the byte length of a character from the
651
* first byte of the sequence. If the given length is at least the
652
* inferred byte length, we have a complete character.
654
return (len >= utf8_ptr::s_charsize(*p));
658
size_t map(char **output_ptr, size_t *output_buf_len,
659
const char *input_ptr, size_t input_len) const
664
* do the full mapping, discarding the partial last character byte
667
return map2(output_ptr, output_buf_len, input_ptr, input_len,
671
/* map a string, providing partial character info */
672
virtual size_t map2(char **output_ptr, size_t *output_buf_len,
673
const char *input_ptr, size_t input_len,
674
size_t *partial_len) const;
677
/* we don't need a mapping table - ignore any that is set */
678
virtual void set_mapping(wchar_t, wchar_t) { /* deliberately empty */ }
681
/* ------------------------------------------------------------------------ */
683
* Character mapper base class for UCS-2 to UTF-8. We will subclass
684
* this mapper for big-endian and little-endian UCS-2 representations,
685
* but both mappers are essentially the same in that only format
686
* translation is required, since UCS-2 and UTF-8 use the same code
687
* point mapping (i.e., Unicode).
689
class CCharmapToUniUcs2: public CCharmapToUni
692
/* read from a file */
693
virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
694
unsigned long read_limit);
696
/* determine if a byte sequence forms a complete character */
697
virtual int is_complete_char(const char *, size_t len) const
699
/* every character in UCS-2 requires two bytes */
703
/* map a string, providing partial character info */
704
virtual size_t map2(char **output_ptr, size_t *output_buf_len,
705
const char *input_ptr, size_t input_len,
706
size_t *partial_len) const
709
* if the input length is odd, there's one byte of partial
710
* character information at the end of the buffer; otherwise
711
* everything is valid
713
*partial_len = (input_len & 1);
715
/* perform the usual mapping */
716
return map(output_ptr, output_buf_len, input_ptr, input_len);
721
* there's no mapping table for UCS-2 translations, so we don't need
722
* to do anything with mappings
724
virtual void set_mapping(wchar_t, wchar_t) { /* deliberately empty */ }
726
/* temporary buffer for reading files */
730
/* ------------------------------------------------------------------------ */
732
* Character mapper for UCS-2 little-endian to UTF-8
734
class CCharmapToUniUcs2Little: public CCharmapToUniUcs2
738
size_t map(char **output_ptr, size_t *output_buf_len,
739
const char *input_ptr, size_t input_len) const;
742
/* ------------------------------------------------------------------------ */
744
* Character mapper for UCS-2 big-endian to UTF-8
746
class CCharmapToUniUcs2Big: public CCharmapToUniUcs2
750
size_t map(char **output_ptr, size_t *output_buf_len,
751
const char *input_ptr, size_t input_len) const;
754
/* ------------------------------------------------------------------------ */
756
* Basic character mapper for single-byte character sets to UTF-8
758
class CCharmapToUniSB_basic: public CCharmapToUni
761
/* read from a single-byte input file, translating to UTF-8 */
762
virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
763
unsigned long read_limit);
765
/* determine if a byte sequence forms a complete character */
766
virtual int is_complete_char(const char *, size_t) const
769
* every character in a single-byte set requires just one byte;
770
* since 'len' is required to be at least one, there's no way we
771
* can't have a complete character
776
/* map a string, providing partial character info */
777
virtual size_t map2(char **output_ptr, size_t *output_buf_len,
778
const char *input_ptr, size_t input_len,
779
size_t *partial_len) const
782
* for all single-byte character sets, one byte == one character,
783
* so it's impossible to have partial characters
787
/* perform the normal mapping */
788
return map(output_ptr, output_buf_len, input_ptr, input_len);
792
/* temporary buffer for reading files */
796
/* ------------------------------------------------------------------------ */
798
* Character mapper for plain ASCII to UTF-8
800
class CCharmapToUniASCII: public CCharmapToUniSB_basic
804
size_t map(char **output_ptr, size_t *output_buf_len,
805
const char *input_ptr, size_t input_len) const;
809
* there's no map for the ASCII translation, so we can ignore
812
void set_mapping(wchar_t, wchar_t) { /* deliberately empty */ }
815
/* ------------------------------------------------------------------------ */
817
* Character mapper for single-byte character sets to UTF-8.
819
class CCharmapToUniSB: public CCharmapToUniSB_basic
826
/* initialize the mapping table to all U+FFFD */
827
for (i = 0 ; i < 256 ; ++i)
832
size_t map(char **output_ptr, size_t *output_buf_len,
833
const char *input_ptr, size_t input_len) const;
837
void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt)
840
* set the mapping, ignoring characters outside of our 8-bit
843
if (((unsigned int)local_code_pt) < 256)
844
map_[local_code_pt] = uni_code_pt;
849
* our mapping table - since the source character set is
850
* single-byte, we need only store a wchar_t for each of the
851
* possible 256 source characters
856
/* ------------------------------------------------------------------------ */
858
* Character mapper for mixed multi-byte character sets to UTF-8. This
859
* maps from local character sets that use a mixture of one-byte and
860
* two-byte sequences to represent characters.
864
* Primary-byte mapping table entry. This gives us mapping instructions
865
* for each leading byte of a character sequence.
867
* Each character is represented by a one-byte or two-byte sequence. This
868
* mapper assumes a context-free mapping, hence for each character
869
* represented by a single byte, that single byte unambiguously indicates
870
* that character, and hence is never the first byte of a two-byte
871
* sequence. For each character represented by a two-byte sequence, the
872
* first byte of the sequence can only be part of two-byte sequences, hence
873
* whenever we see that first byte we'll know for sure we have a two-byte
876
* Each mapping here is for a first byte. If the byte is a single-byte
877
* character, then the 'sub' pointer is null and the 'ch' entry gives the
878
* Unicode code point for the character. If the byte is the lead byte of
879
* one or more two-byte characters, then the 'sub' pointer is non-null and
885
* The sub-mapping table. This is a pointer to a table of the Unicode
886
* code points of the two-byte sequences that start with this byte.
887
* Each entry in the array is a Unicode code point, and the array is
888
* indexed by the second byte of the two-byte sequence. If this
889
* pointer is null, then this lead byte is a single-byte character.
891
* Note that this pointer, if non-null, always points to a 256-element
892
* array. This array can thus be indexed directly with any unsigned
893
* 8-bit byte value without any range checking.
898
* The Unicode code point of this character, if this primary byte is a
899
* one-byte character.
906
* The multi-byte-to-UTF8 mapper
908
class CCharmapToUniMB: public CCharmapToUni
913
/* delete the table */
914
virtual ~CCharmapToUniMB();
916
/* determine if a byte sequence forms a complete character */
917
virtual int is_complete_char(const char *p, size_t len) const
920
* Check the first byte to see if this is a leading byte or a
921
* stand-alone single byte.
923
if (map_[(unsigned char)*p].sub == 0)
926
* it's a stand-alone byte, so the character length is one;
927
* 'len' is required to be at least 1, so we definitely have a
934
/* it's a lead byte, so the character length is two */
939
/* read from a multi-byte input file, translating to UTF-8 */
940
virtual size_t read_file(osfildef *fp, char *buf, size_t bufl,
941
unsigned long read_limit);
944
size_t map(char **output_ptr, size_t *output_buf_len,
945
const char *input_ptr, size_t input_len) const
950
* do the full mapping, discarding the partial last character byte
953
return map2(output_ptr, output_buf_len, input_ptr, input_len,
957
/* map a string, providing partial character info */
958
virtual size_t map2(char **output_ptr, size_t *output_buf_len,
959
const char *input_ptr, size_t input_len,
960
size_t *partial_len) const;
964
void set_mapping(wchar_t uni_code_pt, wchar_t local_code_pt);
967
/* the primary-byte mapping table */
968
cmap_mb_entry map_[256];
970
/* temporary buffer for reading files */
974
/* ------------------------------------------------------------------------ */
976
* Character mapper for double-byte character sets to UTF-8. This maps
977
* from local character sets that use a two-byte sequence to represent
978
* each local character.
980
* For now, this is a trivial subclass of the multi-byte mapper; that
981
* mapper handles the more general case of varying-length local
982
* characters, so it can easily handle the case where every character is
983
* represented by two bytes. If there is sufficient demand for it, a
984
* special-case subclass to handle double-byte character sets specifically
985
* could provide efficiency gains, since it wouldn't have to check each
986
* lead byte to determine the character sequence length.
988
class CCharmapToUniDB: public CCharmapToUniMB
993
/* ------------------------------------------------------------------------ */
995
* Character mapper for plain ISO-8859-1 to UTF-8
997
class CCharmapToUni8859_1: public CCharmapToUniSB
1001
CCharmapToUni8859_1()
1006
* Initialize our mapping table. Each 8859-1 code point maps to
1007
* the same code point in Unicode, so this is a trivial
1010
for (i = 0 ; i < 256 ; ++i)
1015
/* ======================================================================== */
1017
* Unicode UTF-8 - to - local character set mappers
1020
/* ------------------------------------------------------------------------ */
1022
* Trivial character mapper for UTF8-to-UTF8 conversions. This can be
1023
* used when writing external data in UTF8 format; since this is the
1024
* same format we use internally, no conversion is required.
1026
class CCharmapToLocalUTF8: public CCharmapToLocal
1030
virtual size_t map_utf8(char *dest, size_t dest_len,
1031
utf8_ptr src, size_t src_byte_len,
1032
size_t *src_bytes_used) const;
1034
/* map a null-terminated string */
1035
virtual size_t map_utf8z(char *dest, size_t dest_len,
1036
utf8_ptr src) const;
1038
/* map a character */
1039
size_t map(wchar_t unicode_char, char **output_ptr,
1040
size_t *output_len) const;
1043
* determine if the given Unicode character has a mapping to the local
1046
virtual int is_mappable(wchar_t unicode_char) const
1048
/* every character can be mapped UTF8-to-UTF8, obviously */
1053
/* ------------------------------------------------------------------------ */
1055
* Character mapper for single-byte character sets. Each character in
1056
* the local (output) character set is represented by a single byte.
1058
class CCharmapToLocalSB: public CCharmapToLocal
1062
/*
 *   map a counted string: 'src' and 'src_byte_len' give the source,
 *   'dest' and 'dest_len' give the output buffer.
 *   NOTE(review): 'src_bytes_used' is presumably filled in with the
 *   number of source bytes actually consumed - confirm against the base
 *   class declaration.
 */
virtual size_t map_utf8(char *dest, size_t dest_len,
1063
utf8_ptr src, size_t src_byte_len,
1064
size_t *src_bytes_used) const;
1066
/* map a null-terminated string (no explicit source length is passed) */
1067
virtual size_t map_utf8z(char *dest, size_t dest_len,
1068
utf8_ptr src) const;
1070
/* map a single character, given as a wchar_t Unicode code */
1071
size_t map(wchar_t unicode_char, char **output_ptr,
1072
size_t *output_len) const;
1076
/* ------------------------------------------------------------------------ */
1078
* Mixed multi-byte mapper. Each local character is represented by a
1079
* sequence of one or more bytes.
1081
* This class is a trivial subclass of CCharmapToLocalSB. The single-byte
1082
* base class already does everything we need to do, because it is designed
1083
* to cope with mappings that involve expansions that represent a single
1084
* Unicode character with a sequence of local characters (for example,
1085
* "(c)" for the copyright symbol).
1087
class CCharmapToLocalMB: public CCharmapToLocalSB
1093
* Double-byte mapper. Each local character is represented by exactly two
1094
* bytes. This class is a trivial subclass of CCharmapToLocalMB, because
1095
* the multi-byte mapper already handles the more general case of local
1096
* character representations that use varying byte lengths; there is no
1097
* particular efficiency gain to be had by creating a separate special-case
1098
* class for double-byte character sets.
1100
class CCharmapToLocalDB: public CCharmapToLocalMB
1106
/* ------------------------------------------------------------------------ */
1108
* Character mapper for mapping to local default 7-bit ASCII. This
1109
* mapper has a built-in character set translation so that we can
1110
* always create one without having to find an external mapping file.
1112
class CCharmapToLocalASCII: public CCharmapToLocalSB
1115
/*
 *   default constructor - declared here without an inline body.
 *   NOTE(review): presumably this is where the built-in ASCII translation
 *   is set up - confirm in the implementation file.
 */
CCharmapToLocalASCII();
1120
* Character mapper for mapping to local ISO-8859-1. This mapper has a
1121
* built-in character set translation so that we can always create one
1122
* even without an external mapping file.
1124
class CCharmapToLocal8859_1: public CCharmapToLocalSB
1127
/*
 *   default constructor - declared here without an inline body.
 *   NOTE(review): presumably this is where the built-in ISO-8859-1
 *   translation is set up - confirm in the implementation file.
 */
CCharmapToLocal8859_1();
1130
/* ------------------------------------------------------------------------ */
1132
* Character mapper for 16-bit Wide Unicode local character set. Stores
1133
* characters in the correct local wchar_t representation. Assumes that
1134
* the pointer is wchar_t-aligned.
1136
* This is a trivial translation. Because we're mapping from Unicode to
1137
* Unicode, the only thing we're changing is the encoding format - the
1138
* character code is simply copied without any translation, since
1139
* Unicode is the same everywhere.
1141
class CCharmapToLocalWideUnicode: public CCharmapToLocal
1145
/*
 *   map a counted string: 'src' and 'src_byte_len' give the source,
 *   'dest' and 'dest_len' give the output buffer.  Note that 'dest' is
 *   declared as a char pointer even though this mapper stores wide
 *   characters.
 *   NOTE(review): the units of 'dest_len' and of the return value (bytes
 *   vs. wide characters) are not visible here - confirm in the
 *   implementation.
 */
virtual size_t map_utf8(char *dest, size_t dest_len,
1146
utf8_ptr src, size_t src_byte_len,
1147
size_t *src_bytes_used) const;
1149
/* map a null-terminated string (no explicit source length is passed) */
1150
virtual size_t map_utf8z(char *dest, size_t dest_len,
1151
utf8_ptr src) const;
1153
/* map a single character, given as a wchar_t Unicode code */
1154
size_t map(wchar_t unicode_char, char **output_ptr,
1155
size_t *output_len) const;
1158
* determine if the given Unicode character has a mapping to the local
1161
virtual int is_mappable(wchar_t unicode_char) const
1163
/* every character can be mapped UTF8-to-UCS2 */
1168
/* ------------------------------------------------------------------------ */
1170
* Character mapper for 16-bit Wide Unicode, big-endian. Stores the
1171
* characters in big-endian UCS-2 representation.
1173
class CCharmapToLocalUcs2Big: public CCharmapToLocal
1177
/*
 *   map a counted string: 'src' and 'src_byte_len' give the source,
 *   'dest' and 'dest_len' give the output buffer.
 *   NOTE(review): 'src_bytes_used' is presumably filled in with the
 *   number of source bytes actually consumed - confirm against the base
 *   class declaration.
 */
virtual size_t map_utf8(char *dest, size_t dest_len,
1178
utf8_ptr src, size_t src_byte_len,
1179
size_t *src_bytes_used) const;
1181
/* map a null-terminated string (no explicit source length is passed) */
1182
virtual size_t map_utf8z(char *dest, size_t dest_len,
1183
utf8_ptr src) const;
1185
/* map a single character, given as a wchar_t Unicode code */
1186
size_t map(wchar_t unicode_char, char **output_ptr,
1187
size_t *output_len) const;
1190
/* ------------------------------------------------------------------------ */
1192
* Character mapper for 16-bit Wide Unicode, little-endian. Stores the
1193
* characters in little-endian UCS-2 representation.
1195
class CCharmapToLocalUcs2Little: public CCharmapToLocal
1199
/*
 *   map a counted string: 'src' and 'src_byte_len' give the source,
 *   'dest' and 'dest_len' give the output buffer.
 *   NOTE(review): 'src_bytes_used' is presumably filled in with the
 *   number of source bytes actually consumed - confirm against the base
 *   class declaration.
 */
virtual size_t map_utf8(char *dest, size_t dest_len,
1200
utf8_ptr src, size_t src_byte_len,
1201
size_t *src_bytes_used) const;
1203
/* map a null-terminated string (no explicit source length is passed) */
1204
virtual size_t map_utf8z(char *dest, size_t dest_len,
1205
utf8_ptr src) const;
1207
/* map a single character, given as a wchar_t Unicode code */
1208
size_t map(wchar_t unicode_char, char **output_ptr,
1209
size_t *output_len) const;
1213
#endif /* CHARMAP_H */