4
** The author disclaims copyright to this source code. In place of
5
** a legal notice, here is a blessing:
7
** May you do good and not evil.
8
** May you find forgiveness for yourself and forgive others.
9
** May you share freely, never taking more than you give.
11
*************************************************************************
12
** This file contains routines used to translate between UTF-8,
13
** UTF-16, UTF-16BE, and UTF-16LE.
15
** $Id: utf.c,v 1.32 2005/01/28 01:29:08 drh Exp $
19
** Byte-0 Byte-1 Byte-2 Byte-3 Value
20
** 0xxxxxxx 00000000 00000000 0xxxxxxx
21
** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx
22
** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx
23
** 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 000uuuuu zzzzyyyy yyxxxxxx
26
** Notes on UTF-16: (with wwww+1==uuuuu)
28
** Word-0 Word-1 Value
29
** 110110ww wwzzzzyy 110111yy yyxxxxxx 000uuuuu zzzzyyyy yyxxxxxx
30
** zzzzyyyy yyxxxxxx 00000000 zzzzyyyy yyxxxxxx
33
** BOM or Byte Order Mark:
34
** 0xff 0xfe little-endian utf-16 follows
35
** 0xfe 0xff big-endian utf-16 follows
38
** Handling of malformed strings:
40
** SQLite accepts and processes malformed strings without an error wherever
41
** possible. However this is not possible when converting between UTF-8 and
44
** When converting malformed UTF-8 strings to UTF-16, one instance of the
45
** replacement character U+FFFD is written for each byte that cannot be interpreted as
46
** part of a valid unicode character.
48
** When converting malformed UTF-16 strings to UTF-8, one instance of the
49
** replacement character U+FFFD is written for each pair of bytes that cannot be
50
** interpreted as part of a valid unicode character.
52
** This file contains the following public routines:
54
** sqlite3VdbeMemTranslate() - Translate the encoding used by a Mem* string.
55
** sqlite3VdbeMemHandleBom() - Handle byte-order-marks in UTF16 Mem* strings.
56
** sqlite3utf16ByteLen() - Calculate byte-length of a void* UTF16 string.
57
** sqlite3utf8CharLen() - Calculate char-length of a char* UTF8 string.
58
** sqlite3utf8LikeCompare() - Do a LIKE match given two UTF8 char* strings.
61
#include "sqliteInt.h"
66
** This table maps from the first byte of a UTF-8 character to the number
67
** of trailing bytes expected. A value '255' indicates that the table key
68
** is not a legal first byte for a UTF-8 character.
70
static const u8 xtra_utf8_bytes[256] = {
72
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
75
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
76
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
77
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
78
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
79
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
82
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
83
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
84
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
85
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
88
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
95
3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255,
99
** This table maps from the number of trailing bytes in a UTF-8 character
100
** to an integer constant that is effectively calculated for each character
101
** read by a naive implementation of a UTF-8 character reader. The code
102
** in the READ_UTF8 macro explains things best.
104
static const int xtra_utf8_bits[4] = {
106
12416, /* (0xC0 << 6) + (0x80) */
107
925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
108
63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
111
#define READ_UTF8(zIn, c) { \
114
xtra = xtra_utf8_bytes[c]; \
116
case 255: c = (int)0xFFFD; break; \
117
case 3: c = (c<<6) + *(zIn)++; \
118
case 2: c = (c<<6) + *(zIn)++; \
119
case 1: c = (c<<6) + *(zIn)++; \
120
c -= xtra_utf8_bits[xtra]; \
123
int sqlite3ReadUtf8(const unsigned char *z){
129
#define SKIP_UTF8(zIn) { \
130
zIn += (xtra_utf8_bytes[*(u8 *)zIn] + 1); \
133
#define WRITE_UTF8(zOut, c) { \
135
*zOut++ = (c&0xFF); \
137
else if( c<0x00800 ){ \
138
*zOut++ = 0xC0 + ((c>>6)&0x1F); \
139
*zOut++ = 0x80 + (c & 0x3F); \
141
else if( c<0x10000 ){ \
142
*zOut++ = 0xE0 + ((c>>12)&0x0F); \
143
*zOut++ = 0x80 + ((c>>6) & 0x3F); \
144
*zOut++ = 0x80 + (c & 0x3F); \
146
*zOut++ = 0xF0 + ((c>>18) & 0x07); \
147
*zOut++ = 0x80 + ((c>>12) & 0x3F); \
148
*zOut++ = 0x80 + ((c>>6) & 0x3F); \
149
*zOut++ = 0x80 + (c & 0x3F); \
153
#define WRITE_UTF16LE(zOut, c) { \
155
*zOut++ = (c&0x00FF); \
156
*zOut++ = ((c>>8)&0x00FF); \
158
*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
159
*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
160
*zOut++ = (c&0x00FF); \
161
*zOut++ = (0x00DC + ((c>>8)&0x03)); \
165
#define WRITE_UTF16BE(zOut, c) { \
167
*zOut++ = ((c>>8)&0x00FF); \
168
*zOut++ = (c&0x00FF); \
170
*zOut++ = (0x00D8 + (((c-0x10000)>>18)&0x03)); \
171
*zOut++ = (((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \
172
*zOut++ = (0x00DC + ((c>>8)&0x03)); \
173
*zOut++ = (c&0x00FF); \
177
#define READ_UTF16LE(zIn, c){ \
179
c += ((*zIn++)<<8); \
180
if( c>=0xD800 && c<=0xE000 ){ \
182
c2 += ((*zIn++)<<8); \
183
c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
187
#define READ_UTF16BE(zIn, c){ \
190
if( c>=0xD800 && c<=0xE000 ){ \
191
int c2 = ((*zIn++)<<8); \
193
c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \
197
#define SKIP_UTF16BE(zIn){ \
198
if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn+1)==0x00)) ){ \
204
#define SKIP_UTF16LE(zIn){ \
206
if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn-1)==0x00)) ){ \
213
#define RSKIP_UTF16LE(zIn){ \
214
if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn-1)==0x00)) ){ \
220
#define RSKIP_UTF16BE(zIn){ \
222
if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn+1)==0x00)) ){ \
230
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
231
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
233
/* #define TRANSLATE_TRACE 1 */
235
#ifndef SQLITE_OMIT_UTF16
237
** This routine transforms the internal text encoding used by pMem to
238
** desiredEnc. It is an error if the string is already of the desired
239
** encoding, or if *pMem does not contain a string value.
241
int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
242
unsigned char zShort[NBFS]; /* Temporary short output buffer */
243
int len; /* Maximum length of output string in bytes */
244
unsigned char *zOut; /* Output buffer */
245
unsigned char *zIn; /* Input iterator */
246
unsigned char *zTerm; /* End of input */
247
unsigned char *z; /* Output iterator */
250
assert( pMem->flags&MEM_Str );
251
assert( pMem->enc!=desiredEnc );
252
assert( pMem->enc!=0 );
253
assert( pMem->n>=0 );
255
#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
258
sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
259
fprintf(stderr, "INPUT: %s\n", zBuf);
263
/* If the translation is between UTF-16 little and big endian, then
264
** all that is required is to swap the byte order. This case is handled
265
** differently from the others.
267
if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
270
rc = sqlite3VdbeMemMakeWriteable(pMem);
272
assert( rc==SQLITE_NOMEM );
276
zTerm = &zIn[pMem->n];
283
pMem->enc = desiredEnc;
287
/* Set len to the maximum number of bytes required in the output buffer. */
288
if( desiredEnc==SQLITE_UTF8 ){
289
/* When converting from UTF-16, the maximum growth results from
290
** translating a 2-byte character to a 3-byte UTF-8 character (i.e.
291
** code-point 0xFFFC). A single byte is required for the output string
294
len = (pMem->n/2) * 3 + 1;
296
/* When converting from UTF-8 to UTF-16 the maximum growth is caused
297
** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
298
** character. Two bytes are required in the output buffer for the
301
len = pMem->n * 2 + 2;
304
/* Set zIn to point at the start of the input buffer and zTerm to point 1
305
** byte past the end.
307
** Variable zOut is set to point at the output buffer. This may be space
308
** obtained from malloc(), or Mem.zShort, if it is large enough and not in
309
** use, or the zShort array on the stack (see above).
312
zTerm = &zIn[pMem->n];
314
zOut = sqliteMallocRaw(len);
315
if( !zOut ) return SQLITE_NOMEM;
321
if( pMem->enc==SQLITE_UTF8 ){
322
if( desiredEnc==SQLITE_UTF16LE ){
323
/* UTF-8 -> UTF-16 Little-endian */
329
assert( desiredEnc==SQLITE_UTF16BE );
330
/* UTF-8 -> UTF-16 Big-endian */
339
assert( desiredEnc==SQLITE_UTF8 );
340
if( pMem->enc==SQLITE_UTF16LE ){
341
/* UTF-16 Little-endian -> UTF-8 */
343
READ_UTF16LE(zIn, c);
347
/* UTF-16 Big-endian -> UTF-8 */
349
READ_UTF16BE(zIn, c);
356
assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
358
sqlite3VdbeMemRelease(pMem);
359
pMem->flags &= ~(MEM_Static|MEM_Dyn|MEM_Ephem|MEM_Short);
360
pMem->enc = desiredEnc;
362
memcpy(pMem->zShort, zOut, len);
364
pMem->flags |= (MEM_Term|MEM_Short);
366
pMem->flags |= (MEM_Term|MEM_Dyn);
371
#if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
374
sqlite3VdbeMemPrettyPrint(pMem, zBuf, 100);
375
fprintf(stderr, "OUTPUT: %s\n", zBuf);
382
** This routine checks for a byte-order mark at the beginning of the
383
** UTF-16 string stored in *pMem. If one is present, it is removed and
384
** the encoding of the Mem adjusted. This routine does not do any
385
** byte-swapping, it just sets Mem.enc appropriately.
387
** The allocation (static, dynamic etc.) and encoding of the Mem may be
388
** changed by this function.
390
int sqlite3VdbeMemHandleBom(Mem *pMem){
394
if( pMem->n<0 || pMem->n>1 ){
395
u8 b1 = *(u8 *)pMem->z;
396
u8 b2 = *(((u8 *)pMem->z) + 1);
397
if( b1==0xFE && b2==0xFF ){
398
bom = SQLITE_UTF16BE;
400
if( b1==0xFF && b2==0xFE ){
401
bom = SQLITE_UTF16LE;
406
/* This function is called as soon as a string is stored in a Mem*,
407
** from within sqlite3VdbeMemSetStr(). At that point it is not possible
408
** for the string to be stored in Mem.zShort, or for it to be stored
409
** in dynamic memory with no destructor.
411
assert( !(pMem->flags&MEM_Short) );
412
assert( !(pMem->flags&MEM_Dyn) || pMem->xDel );
413
if( pMem->flags & MEM_Dyn ){
414
void (*xDel)(void*) = pMem->xDel;
418
rc = sqlite3VdbeMemSetStr(pMem, &z[2], pMem->n-2, bom, SQLITE_TRANSIENT);
421
rc = sqlite3VdbeMemSetStr(pMem, &pMem->z[2], pMem->n-2, bom,
427
#endif /* SQLITE_OMIT_UTF16 */
430
** z is a UTF-8 encoded unicode string. If nByte is less than zero,
431
** return the number of unicode characters in z up to (but not including)
432
** the first 0x00 byte. If nByte is not less than zero, return the
433
** number of unicode characters in the first nByte bytes of z (or up to
434
** the first 0x00, whichever comes first).
436
int sqlite3utf8CharLen(const char *z, int nByte){
442
zTerm = (const char *)(-1);
445
while( *z!=0 && z<zTerm ){
452
#ifndef SQLITE_OMIT_UTF16
454
** zIn is a UTF-16 encoded unicode string. If nChar is less than zero,
455
** return the number of bytes up to (but not including) the first pair
456
** of consecutive 0x00 bytes in zIn. If nChar is not less than zero,
457
** then return the number of bytes in the first nChar unicode characters
458
** in zIn (or up until the first pair of 0x00 bytes, whichever comes first).
460
int sqlite3utf16ByteLen(const void *zIn, int nChar){
464
if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
465
while( c && ((nChar<0) || n<nChar) ){
470
while( c && ((nChar<0) || n<nChar) ){
475
return (z-(char const *)zIn)-((c==0)?2:0);
479
** UTF-16 implementation of the substr()
481
void sqlite3utf16Substr(
482
sqlite3_context *context,
487
unsigned char const *zStr;
488
unsigned char const *zStrEnd;
489
unsigned char const *zStart;
490
unsigned char const *zEnd;
493
zStr = (unsigned char const *)sqlite3_value_text16(argv[0]);
494
zStrEnd = &zStr[sqlite3_value_bytes16(argv[0])];
495
y = sqlite3_value_int(argv[1]);
496
z = sqlite3_value_int(argv[2]);
501
if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
502
for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16BE(zStart);
504
for(i=0; i<y && zStart<zStrEnd; i++) SKIP_UTF16LE(zStart);
508
if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
509
for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16BE(zStart);
511
for(i=y; i<0 && zStart>zStr; i++) RSKIP_UTF16LE(zStart);
513
for(; i<0; i++) z -= 1;
517
if( SQLITE_UTF16BE==SQLITE_UTF16NATIVE ){
518
for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16BE(zEnd);
520
for(i=0; i<z && zEnd<zStrEnd; i++) SKIP_UTF16LE(zEnd);
523
sqlite3_result_text16(context, zStart, zEnd-zStart, SQLITE_TRANSIENT);
526
#if defined(SQLITE_TEST)
528
** This routine is called from the TCL test function "translate_selftest".
529
** It checks that the primitives for serializing and deserializing
530
** characters in each encoding are inverses of each other.
532
void sqlite3utfSelfTest(){
534
unsigned char zBuf[20];
539
for(i=0; i<0x00110000; i++){
546
assert( (z-zBuf)==n );
548
for(i=0; i<0x00110000; i++){
549
if( i>=0xD800 && i<=0xE000 ) continue;
556
assert( (z-zBuf)==n );
558
for(i=0; i<0x00110000; i++){
559
if( i>=0xD800 && i<=0xE000 ) continue;
566
assert( (z-zBuf)==n );
569
#endif /* SQLITE_TEST */
570
#endif /* SQLITE_OMIT_UTF16 */