12
12
** This file contains routines used to translate between UTF-8,
13
13
** UTF-16, UTF-16BE, and UTF-16LE.
15
** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $
15
** $Id: utf.c,v 1.53 2007/08/07 17:04:59 drh Exp $
49
49
** This lookup table is used to help decode the first byte of
50
50
** a multi-byte UTF8 character.
52
const unsigned char sqlite3UtfTrans1[] = {
52
static const unsigned char sqlite3UtfTrans1[] = {
53
53
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
54
54
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
55
55
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
131
** Translate a single UTF-8 character. Return the unicode value.
133
** During translation, assume that the byte that zTerm points
136
** Write a pointer to the next unread byte back into *pzNext.
138
** Notes On Invalid UTF-8:
140
** * This routine never allows a 7-bit character (0x00 through 0x7f) to
141
** be encoded as a multi-byte character. Any multi-byte character that
142
** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
144
** * This routine never allows a UTF16 surrogate value to be encoded.
145
** If a multi-byte character attempts to encode a value between
146
** 0xd800 and 0xe000 then it is rendered as 0xfffd.
148
** * Bytes in the range of 0x80 through 0xbf which occur as the first
149
** byte of a character are interpreted as single-byte characters
150
** and rendered as themselves even though they are technically
151
** invalid characters.
153
** * This routine accepts an infinite number of different UTF8 encodings
154
** for unicode values 0x80 and greater. It does not change over-length
155
** encodings to 0xfffd as some systems recommend.
158
const unsigned char *z, /* First byte of UTF-8 character */
159
const unsigned char *zTerm, /* Pretend this byte is 0x00 */
160
const unsigned char **pzNext /* Write first byte past UTF-8 char here */
164
c = sqlite3UtfTrans1[c-0xc0];
165
while( z!=zTerm && (*z & 0xc0)==0x80 ){
166
c = (c<<6) + (0x3f & *(z++));
169
|| (c&0xFFFFF800)==0xD800
170
|| (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }
130
179
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
131
180
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
221
270
if( pMem->enc==SQLITE_UTF8 ){
222
unsigned int iExtra = 0xD800;
224
if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){
225
/* This UTF8 string is not nul-terminated, and the last byte is
226
** not a character in the ASCII range (codepoints 0..127). This
227
** means the SQLITE_READ_UTF8() macro might read past the end
228
** of the allocated buffer.
230
** There are four possibilities:
232
** 1. The last byte is the first byte of a non-ASCII character,
234
** 2. The final N bytes of the input string are continuation bytes
235
** and immediately preceding them is the first byte of a
236
** non-ASCII character.
238
** 3. The final N bytes of the input string are continuation bytes
239
** and immediately preceding them is a byte that encodes a
240
** character in the ASCII range.
242
** 4. The entire string consists of continuation characters.
244
** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8()
245
** macro will not overread the buffer in these cases.
247
unsigned char *zExtra = &zTerm[-1];
248
while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){
252
if( (zExtra[0]&0xC0)==0xC0 ){
253
/* Make a copy of the last character encoding in the input string.
254
** Then make sure it is nul-terminated and use SQLITE_READ_UTF8()
255
** to decode the codepoint. Store the codepoint in variable iExtra,
256
** it will be appended to the output string later.
258
unsigned char *zFree = 0;
259
unsigned char zBuf[16];
260
int nExtra = (pMem->n+zIn-zExtra);
263
zExtra = sqliteMallocRaw(nExtra+1);
271
memcpy(zExtra, zTerm, nExtra);
272
zExtra[nExtra] = '\0';
273
SQLITE_READ_UTF8(zExtra, iExtra);
278
271
if( desiredEnc==SQLITE_UTF16LE ){
279
272
/* UTF-8 -> UTF-16 Little-endian */
280
273
while( zIn<zTerm ){
281
SQLITE_READ_UTF8(zIn, c);
274
c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
282
275
WRITE_UTF16LE(z, c);
284
if( iExtra!=0xD800 ){
285
WRITE_UTF16LE(z, iExtra);
288
278
assert( desiredEnc==SQLITE_UTF16BE );
289
279
/* UTF-8 -> UTF-16 Big-endian */
290
280
while( zIn<zTerm ){
291
SQLITE_READ_UTF8(zIn, c);
281
c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
292
282
WRITE_UTF16BE(z, c);
294
if( iExtra!=0xD800 ){
295
WRITE_UTF16BE(z, iExtra);
298
285
pMem->n = z - zOut;
477
464
int sqlite3Utf8To8(unsigned char *zIn){
478
465
unsigned char *zOut = zIn;
479
466
unsigned char *zStart = zIn;
467
unsigned char *zTerm;
483
SQLITE_READ_UTF8(zIn, c);
471
c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
486
473
WRITE_UTF8(zOut, c);