1
/* $Header: d:/cvsroot/tads/tads3/utf8.h,v 1.2 1999/05/17 02:52:29 MJRoberts Exp $ */
4
* Copyright (c) 1998, 2002 Michael J. Roberts. All Rights Reserved.
6
* Please see the accompanying license file, LICENSE.TXT, for information
7
* on using and copying this software.
11
utf8.h - UTF-8 character string manipulation
17
10/16/98 MJRoberts - Creation
25
/* ------------------------------------------------------------------------ */
27
* UTF-8 character string pointer.
29
* Note that this class deviates from the normal naming convention where
30
* each class begins with a capital 'C'. Since this class is so
31
* low-level, and is used so much like the (char *) type, it seems more
32
* proper to give it a name as though it were a typedef for a native type.
34
* If ever there was a time when operator overloading is indicated, this
35
* would be it. We could overload increment and decrement operators, for
36
* example, to step through the string. However, I just plain don't like
37
* operator overloading, so I do not use it here. Instead, we use
38
* explicit method names to avoid obfuscating the code as overloaded
39
* operators would. It's a trade-off: it's less concise this way, but
42
* Note the important distinction between "byte" and "character": a byte
43
* is the basic multi-bit unit of native storage, and a character
44
* represents the basic lexical unit; a character may be composed of more
51
/* create a UTF-8 string pointer, with no initial underlying string */
52
utf8_ptr() { p_ = 0; }
55
* Create a UTF-8 string pointer with an underlying string. The
56
* pointer must point to the first byte of a valid character.
58
utf8_ptr(char *str) { set(str); }
61
* Set the pointer to a new underlying buffer. The pointer must
62
* point to the first byte of a valid character if there are already
63
* characters in the buffer.
65
void set(char *str) { p_ = str; }
68
* Get the character at the current position
70
wchar_t getch() const { return s_getch(p_); }
73
* Get the character at a given character offset from the current
74
* position. The offset must be positive.
76
wchar_t getch_at(size_t ofs) const { return s_getch_at(p_, ofs); }
79
* Get the character preceding the current character by the given
80
* amount. The offset must be positive. getch_before(1) returns
81
* the character preceding the current character, (2) returns the
82
* character two positions before the current character, and so on.
84
wchar_t getch_before(size_t ofs) const { return s_getch_before(p_,ofs); }
87
* Encode a character into the buffer at the current position, and
88
* increment the pointer past the character.
90
void setch(wchar_t ch)
92
/* store the character and advance the buffer pointer */
93
p_ += s_putch(p_, ch);
96
/* call setch() for each character in a null-terminated string */
97
void setch_str(const char *str)
99
for ( ; *str != '\0' ; ++str)
100
p_ += s_putch(p_, *str);
104
/*
 *   Encode a string of wide characters into the buffer.  We'll
 *   increment our pointer so that it points to the next available
 *   character when we're done.  Returns the number of bytes used for
 *   the encoding.
 *
 *   'src_count' is the number of wide characters in the source string.
 *
 *   'bufsiz' gives the size remaining in the underlying buffer.  If
 *   we run out of space, we won't encode any more characters, but we
 *   will still return the total number of bytes required to encode
 *   the whole source string.
 */
size_t setwchars(const wchar_t *src, size_t src_count, size_t bufsiz);
119
/*
 *   Encode a null-terminated string of wide characters into our
 *   buffer.  Works like setwchars(), but stops at the null terminator
 *   in the source rather than taking a character count.
 *
 *   This routine includes the null terminator in the resulting UTF-8
 *   string, and includes the space it takes in the result length, BUT
 *   we leave our pointer pointing to the null terminator.
 */
size_t setwcharsz(const wchar_t *src, size_t bufsiz);
129
/* increment the pointer by one character */
130
void inc() { p_ = s_inc(p_); }
133
* increment the pointer by one character, and decrement a remaining
134
* length counter accordingly
136
void inc(size_t *rem)
140
/* calculate the increment amount */
143
/* decrement the length counter by the change */
146
/* save the new pointer value */
150
/* decrement the pointer by one character */
151
void dec() { p_ = s_dec(p_); }
153
/* decrement poniter and increment the remaining size to compensate */
154
void dec(size_t *rem)
158
/* calculate the decrement amount */
161
/* decrement the length counter by the change */
164
/* save the new pointer value */
169
* Determine if the current character is a continuation character.
170
* Returns 1 if so, 0 if not.
172
int is_continuation() const { return s_is_continuation(p_); }
175
* count the number of characters in the given number of bytes,
176
* starting at the current byte
178
size_t len(size_t bytecnt) const
184
/* calculate the ending point */
188
/* increment until we run out of bytes */
189
for (cnt = 0 ; p < end ; p = s_inc(p), ++cnt) ;
191
/* return the result */
195
/* get the byte size of the current character */
196
size_t charsize() const { return s_charsize(*p_); }
199
* Get the number of bytes in the given number of characters
200
* starting at the current position.
202
size_t bytelen(size_t charcnt) const
206
/* skip the given number of characters */
207
for (p = p_ ; charcnt != 0 ; p = s_inc(p), --charcnt) ;
209
/* return the number of bytes we skipped */
214
* count the number of characters to the null terminator
221
/* increment until we find a null byte */
222
for (cnt = 0, p = p_ ; *p != 0 ; p = s_inc(p), ++cnt) ;
224
/* return the result */
228
/* get the current pointer position */
229
char *getptr() const { return p_; }
231
/* -------------------------------------------------------------------- */
237
/*
 *   Compare two UTF-8 strings, each given as a byte pointer and a byte
 *   length.  Returns a value less than zero if the first string is
 *   lexically less than the second string (i.e., the first string
 *   sorts ahead of the second string), zero if the two strings are
 *   identical, or a value greater than zero if the first string is
 *   lexically greater than the second string.
 */
static int s_compare_to(const char *p1, size_t bytelen1,
                        const char *p2, size_t bytelen2);
246
/*
 *   Get the character at the given byte pointer.  'p' must point to
 *   the first byte of a one-, two-, or three-byte UTF-8 sequence; only
 *   as many bytes as the lead byte calls for are read.
 */
static wchar_t s_getch(const char *p)
{
    /* view the bytes as unsigned so the masks below see raw bit patterns */
    const unsigned char *b = (const unsigned char *)p;

    /*
     *   If the high bit is 0, it's a one-byte sequence encoding the
     *   value in the low seven bits.
     */
    if ((b[0] & 0x80) == 0)
        return (wchar_t)(b[0] & 0x7F);

    /*
     *   If the high three bits are 110, it's a two-byte sequence, with
     *   the high-order 5 bits in the low 5 bits of the first byte, and
     *   the low-order 6 bits in the low 6 bits of the second byte.
     */
    if ((b[0] & 0xE0) == 0xC0)
        return (wchar_t)(((b[0] & 0x1F) << 6)
                         + (b[1] & 0x3F));

    /*
     *   Otherwise, we have a three-byte sequence: the high-order 4 bits
     *   are in the low-order 4 bits of the first byte, the next 6 bits
     *   are in the low-order 6 bits of the second byte, and the
     *   low-order 6 bits are in the low-order 6 bits of the third byte.
     */
    return (wchar_t)(((b[0] & 0x0F) << 12)
                     + ((b[1] & 0x3F) << 6)
                     + (b[2] & 0x3F));
}
277
* get the character at a given positive character offset from a
280
static wchar_t s_getch_at(const char *p, size_t ofs)
282
/* skip the given number of characters */
283
for ( ; ofs != 0 ; --ofs, p += s_charsize(*p)) ;
285
/* return the character at the current position */
290
* get the character preceding the current character by the given
291
* number of positions; the offset value must be positive
293
static wchar_t s_getch_before(const char *p, size_t ofs)
295
/* skip backwards the given number of characters */
296
for ( ; ofs != 0 ; --ofs)
299
* back up by one to three bytes, until we find no more
303
p -= s_is_continuation(p);
304
p -= s_is_continuation(p);
307
/* return the character at the current position */
312
/*
 *   Write a given wchar_t value to the given byte pointer.  The
 *   caller must already have checked (via s_wchar_size) that there's
 *   enough room in the buffer for this character's UTF-8
 *   representation.
 *
 *   Only one-, two- and three-byte encodings are produced, so only the
 *   low-order 16 bits of 'ch' are represented in the output.
 *
 *   Returns the number of bytes stored.
 */
static size_t s_putch(char *p, wchar_t ch)
{
    /* check the range to determine how to encode it */
    if (ch <= 0x7f)
    {
        /*
         *   it's in the range 0x0000 to 0x007f - encode the low-order
         *   7 bits in a single byte
         */
        p[0] = (char)(ch & 0x7f);
        return 1;
    }
    else if (ch <= 0x07ff)
    {
        /*
         *   It's in the range 0x0080 to 0x07ff - encode it in two
         *   bytes.  The high-order 5 bits go in the first byte after
         *   the two-byte prefix of 110, and the low-order 6 bits go in
         *   the second byte after the continuation prefix of 10.
         */
        p[0] = (char)(0xC0 | ((ch >> 6) & 0x1F));
        p[1] = (char)(0x80 | (ch & 0x3F));
        return 2;
    }
    else
    {
        /*
         *   It's in the range 0x0800 to 0xffff - encode it in three
         *   bytes.  The high-order 4 bits go in the first byte after
         *   the 1110 prefix, the next 6 bits go in the second byte
         *   after the 10 continuation prefix, and the low-order 6 bits
         *   go in the third byte after another 10 continuation prefix.
         */
        p[0] = (char)(0xE0 | ((ch >> 12) & 0x0F));
        p[1] = (char)(0x80 | ((ch >> 6) & 0x3F));
        p[2] = (char)(0x80 | (ch & 0x3F));
        return 3;
    }
}
359
/* increment a pointer to a buffer, returning the result */
360
static char *s_inc(char *p)
363
* increment the pointer by the size of the current character
364
* and return the result
366
return p + s_charsize(*p);
369
/*
 *   Get the size in bytes of the character whose first byte is 'c'.
 *
 *   We check the top three bits.  If the pattern is 111xxxxx, we're
 *   pointing to a three-byte sequence.  If the pattern is 110xxxxx,
 *   we're pointing to a two-byte sequence.  If it's 0xxxxxxx, it's a
 *   one-byte sequence.
 *
 *   To avoid conditionals, the size is computed with bit masks and
 *   shifts only:
 *
 *   - The size is always at least 1 byte, so we start with 1.
 *
 *   - Sizes other than 1 require the high bit to be set.  (byte & 0x80)
 *   shifted right by seven, OR'ed with the same thing shifted right by
 *   six, gives a mask of 0 when the high bit is clear and 3 when it's
 *   set.
 *
 *   - The third bit (xx1xxxxx) is picked out with 0x20 and shifted
 *   right by 5, giving 1 for a three-byte lead.  Adding 1 gives 1 for
 *   a two-byte sequence and 2 for a three-byte sequence.
 *
 *   - AND'ing the mask with that value yields the number of extra
 *   bytes, which is added to the base size of 1.
 */
static size_t s_charsize(char c)
{
    /* go through unsigned char so a signed 'char' isn't sign-extended */
    unsigned int ch = (unsigned int)(unsigned char)c;

    return (1 +
            ((((ch & 0x80) >> 7) | ((ch & 0x80) >> 6))
             & (1 + ((ch & 0x20) >> 5))));
}
411
/*
 *   Get the number of bytes required to encode a given wchar_t in
 *   UTF-8 as produced by this class (at most three bytes).
 */
static size_t s_wchar_size(wchar_t ch)
{
    /* characters 0-0x7f take up one byte */
    if (ch < 0x80)
        return 1;

    /* characters 0x80-0x7ff take up two bytes */
    if (ch < 0x800)
        return 2;

    /* all others take up three bytes */
    return 3;
}
423
/* decrement a pointer by one character, returning the result */
424
static char *s_dec(char *p)
427
* Going backwards, we can't tell that we're on a start byte
428
* until we get there - there's no context to tell us which byte
429
* of a multi-byte sequence we're on, except that we can tell
430
* whether or not we're on the first byte or an extra byte. So,
431
* decrement the pointer by a byte; if we're not on a start
432
* byte, decrement by another byte; if we're still not on a
433
* start byte, decrement it again.
435
* Since the longest possible sequence is three bytes, we'll
436
* unroll the loop and simply check twice to see if we're done
440
p -= s_is_continuation(p);
441
p -= s_is_continuation(p);
443
/* return the result */
448
/*
 *   Determine if the byte at 'p' is a continuation byte.  Returns 1
 *   if this is a continuation byte, 0 if not.
 *
 *   Continuation bytes have the pattern 10xxxxxx.  Initial bytes never
 *   have this pattern.  So, if a byte ANDed with 0xC0 is 0x80 (i.e.,
 *   the high two bits have the exact pattern '10'), we're on a
 *   continuation byte.
 *
 *   To avoid conditionals, we compute the result with bit masks: mask
 *   the byte with 0x80 to pick out the high bit and shift it right
 *   seven bits, giving 1 for 1xxxxxxx.  Then mask the byte with 0x40
 *   to pick out the second bit, invert the result, AND it again with
 *   0x40, and shift it right six bits, giving 1 for x0xxxxxx.  AND'ing
 *   the two gives 1 for 10xxxxxx and 0 for anything else.  Callers can
 *   subtract this value directly from a pointer to back up over
 *   continuation bytes.
 */
static int s_is_continuation(const char *p)
{
    /* go through unsigned char so a signed 'char' isn't sign-extended */
    unsigned int ch = (unsigned int)(unsigned char)*p;

    return (((ch & 0x80) >> 7)
            & (((~(ch & 0x40)) & 0x40) >> 6));
}
480
* Truncate a string to the given byte length, ensuring that only
481
* whole characters are included in the result. Takes the proposed
482
* truncated length, and returns the actual length to use. The
483
* returned length will be less than or equal to the proposed
484
* length; if the returned length is less than the proposed length,
485
* it means that the proposed length would have cut off a multi-byte
486
* character, so the actual length had to be shorter to ensure that
487
* no bytes of the final character were included.
489
static size_t s_trunc(const char *p, size_t len)
495
* if the length is zero, no adjustment is needed - you
496
* obviously can't divide zero bytes
502
* Get a pointer to the start of the last byte within the
503
* proposed truncated byte region. Note that the last byte in
504
* the buffer is at index (len-1), since the byte at index (len)
505
* is the next byte after the truncated region.
507
last_ch = p + len - 1;
510
* Decrement this byte pointer until we get to the start of the
511
* character that contains the final byte. Since a character
512
* can never be more than three bytes long, we need decrement
513
* our pointer a maximum of two times.
515
last_ch -= s_is_continuation(last_ch);
516
last_ch -= s_is_continuation(last_ch);
519
* figure the number of bytes of the last character that are
520
* actually in the truncated region - this is simply the number
521
* of bytes from where we are now to the end of the region
523
last_ch_len = len - (last_ch - p);
526
* Now compute the actual size of this last character. If the
527
* last character's actual size is the same as the truncated
528
* size, then the last character fits exactly and we can return
529
* the proposed length unchanged. If the last character's
530
* required length is more than the truncated length, it means
531
* that the truncation has cut off the last character so that
532
* not all of its bytes fit, and hence we cannot include ANY of
533
* the last character's bytes in the result.
535
if (last_ch_len >= s_charsize(*last_ch))
537
/* the last character fits in the truncation - we're fine */
543
* the last character doesn't fit - truncate so that none of
544
* the last character's bytes are included
546
return (last_ch - p);
551
/* the buffer pointer */