~ubuntu-branches/ubuntu/hardy/sqlite3/hardy

« back to all changes in this revision

Viewing changes to src/utf.c

Committer: Bazaar Package Importer
Author(s): Laszlo Boszormenyi (GCS)
Date: 2007-08-20 16:12:00 UTC
mfrom: (1.1.11 upstream)
Revision ID: james.westby@ubuntu.com-20070820161200-1u06zme8ghkyaenn

Tags: 3.4.2-1

New upstream release.

files added:
mkextu.sh

mkextw.sh

src/dump.txt

src/func.c.try1

src/pager.c.bu1

src/patch1

src/t1

src/t2

test/fts1m.test

test/fts1n.test

test/fts1o.test

test/mallocC.test

test/softheap1.test

files removed:
last_change

files modified:
VERSION

debian/changelog

debian/pkgIndex.tcl

ext/fts1/fts1.c

ext/fts1/fts1_porter.c

ext/fts1/fts1_tokenizer1.c

ext/fts2/fts2.c

ext/fts2/fts2_porter.c

ext/fts2/fts2_tokenizer1.c

main.mk

sqlite3.pc

src/btree.c

src/expr.c

src/func.c

src/insert.c

src/main.c

src/malloc.c

src/os_common.h

src/os_unix.c

src/pager.c

src/prepare.c

src/shell.c

src/sqlite.h.in

src/sqliteInt.h

src/tclsqlite.c

src/test1.c

src/test_config.c

src/test_md5.c

src/tokenize.c

src/trigger.c

src/utf.c

src/vdbe.c

src/vdbeapi.c

src/vdbeaux.c

src/where.c

test/attach2.test

test/avtrans.test

test/cache.test

test/capi3.test

test/capi3b.test

test/cast.test

test/check.test

test/collate6.test

test/exclusive.test

test/exclusive2.test

test/fts2k.test

test/incrblob.test

test/incrvacuum.test

test/lock2.test

test/malloc5.test

test/mallocB.test

test/misc4.test

test/pageropt.test

test/schema.test

test/select1.test

test/shared.test

test/tester.tcl

tool/lemon.c

tool/mkkeywordhash.c

tool/mksqlite3c.tcl

www/changes.tcl

www/faq.tcl

www/index.tcl

www/lang.tcl

www/limits.tcl

www/oldnews.tcl

www/pragma.tcl

Show diffs side-by-side

added added

removed removed

src/utf.c

** This file contains routines used to translate between UTF-8,

** UTF-16, UTF-16BE, and UTF-16LE.

** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $

** $Id: utf.c,v 1.53 2007/08/07 17:04:59 drh Exp $

** Notes on UTF-8:

** This lookup table is used to help decode the first byte of

** a multi-byte UTF8 character.

const unsigned char sqlite3UtfTrans1[] = {

static const unsigned char sqlite3UtfTrans1[] = {

0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,

0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,

0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,

};

#define WRITE_UTF8(zOut, c) { \

if( c<0x00080 ){ \

*zOut++ = (c&0xFF); \

127

128

}

128

129

130

131

** Translate a single UTF-8 character. Return the unicode value.

132

133

** During translation, assume that the byte that zTerm points

134

** is a 0x00.

135

136

** Write a pointer to the next unread byte back into *pzNext.

137

138

** Notes On Invalid UTF-8:

139

140

** * This routine never allows a 7-bit character (0x00 through 0x7f) to

141

** be encoded as a multi-byte character. Any multi-byte character that

142

** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.

143

144

** * This routine never allows a UTF16 surrogate value to be encoded.

145

** If a multi-byte character attempts to encode a value between

146

** 0xd800 and 0xe000 then it is rendered as 0xfffd.

147

148

** * Bytes in the range of 0x80 through 0xbf which occur as the first

149

** byte of a character are interpreted as single-byte characters

150

** and rendered as themselves even though they are technically

151

** invalid characters.

152

153

** * This routine accepts an infinite number of different UTF8 encodings

154

** for unicode values 0x80 and greater. It does not change over-length

155

** encodings to 0xfffd as some systems recommend.

156

157

int sqlite3Utf8Read(

158

const unsigned char *z, /* First byte of UTF-8 character */

159

const unsigned char *zTerm, /* Pretend this byte is 0x00 */

160

const unsigned char **pzNext /* Write first byte past UTF-8 char here */

161

){

162

int c = *(z++);

163

if( c>=0xc0 ){

164

c = sqlite3UtfTrans1[c-0xc0];

165

while( z!=zTerm && (*z & 0xc0)==0x80 ){

166

c = (c<<6) + (0x3f & *(z++));

167

}

168

if( c<0x80

169

|| (c&0xFFFFF800)==0xD800

170

|| (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }

171

}

172

*pzNext = z;

173

return c;

174

}

175

176

177

178

130

179

** If the TRANSLATE_TRACE macro is defined, the value of each Mem is

131

180

** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().

132

181

219

268

z = zOut;

220

269

221

270

if( pMem->enc==SQLITE_UTF8 ){

222

unsigned int iExtra = 0xD800;

223

224

if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){

225

/* This UTF8 string is not nul-terminated, and the last byte is

226

** not a character in the ASCII range (codepoints 0..127). This

227

** means the SQLITE_READ_UTF8() macro might read past the end

228

** of the allocated buffer.

229

230

** There are four possibilities:

231

232

** 1. The last byte is the first byte of a non-ASCII character,

233

234

** 2. The final N bytes of the input string are continuation bytes

235

** and immediately preceding them is the first byte of a

236

** non-ASCII character.

237

238

** 3. The final N bytes of the input string are continuation bytes

239

** and immediately preceding them is a byte that encodes a

240

** character in the ASCII range.

241

242

** 4. The entire string consists of continuation characters.

243

244

** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8()

245

** macro will not overread the buffer in these cases.

246

247

unsigned char *zExtra = &zTerm[-1];

248

while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){

249

zExtra--;

250

}

251

252

if( (zExtra[0]&0xC0)==0xC0 ){

253

/* Make a copy of the last character encoding in the input string.

254

** Then make sure it is nul-terminated and use SQLITE_READ_UTF8()

255

** to decode the codepoint. Store the codepoint in variable iExtra,

256

** it will be appended to the output string later.

257

258

unsigned char *zFree = 0;

259

unsigned char zBuf[16];

260

int nExtra = (pMem->n+zIn-zExtra);

261

zTerm = zExtra;

262

if( nExtra>15 ){

263

zExtra = sqliteMallocRaw(nExtra+1);

264

if( !zExtra ){

265

return SQLITE_NOMEM;

266

}

267

zFree = zExtra;

268

}else{

269

zExtra = zBuf;

270

}

271

memcpy(zExtra, zTerm, nExtra);

272

zExtra[nExtra] = '\0';

273

SQLITE_READ_UTF8(zExtra, iExtra);

274

sqliteFree(zFree);

275

}

276

}

277

278

271

if( desiredEnc==SQLITE_UTF16LE ){

279

272

/* UTF-8 -> UTF-16 Little-endian */

280

273

while( zIn<zTerm ){

281

SQLITE_READ_UTF8(zIn, c);

274

c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);

282

275

WRITE_UTF16LE(z, c);

283

276

}

284

if( iExtra!=0xD800 ){

285

WRITE_UTF16LE(z, iExtra);

286

}

287

277

}else{

288

278

assert( desiredEnc==SQLITE_UTF16BE );

289

279

/* UTF-8 -> UTF-16 Big-endian */

290

280

while( zIn<zTerm ){

291

SQLITE_READ_UTF8(zIn, c);

281

c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);

292

282

WRITE_UTF16BE(z, c);

293

283

}

294

if( iExtra!=0xD800 ){

295

WRITE_UTF16BE(z, iExtra);

296

}

297

284

}

298

285

pMem->n = z - zOut;

299

286

*z++ = 0;

477

464

int sqlite3Utf8To8(unsigned char *zIn){

478

465

unsigned char *zOut = zIn;

479

466

unsigned char *zStart = zIn;

480

int c;

467

unsigned char *zTerm;

468

u32 c;

481

469

482

while(1){

483

SQLITE_READ_UTF8(zIn, c);

484

if( c==0 ) break;

470

while( zIn[0] ){

471

c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);

485

472

if( c!=0xfffd ){

486

473

WRITE_UTF8(zOut, c);

487

474

}

501

488

unsigned int i, t;

502

489

unsigned char zBuf[20];

503

490

unsigned char *z;

491

unsigned char *zTerm;

504

492

int n;

505

493

unsigned int c;

506

494

509

497

WRITE_UTF8(z, i);

510

498

n = z-zBuf;

511

499

z[0] = 0;

500

zTerm = z;

512

501

z = zBuf;

513

SQLITE_READ_UTF8(z, c);

502

c = sqlite3Utf8Read(z, zTerm, (const u8**)&z);

514

503

t = i;

515

504

if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;

516

505

if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;

Older »