~hikiko/nux/arb-srgba-shader

/*
 * Convert a run of UTF-32 code points to UTF-16.
 *
 * sourceStart/targetStart point at the caller's read and write cursors;
 * sourceEnd/targetEnd are one past the end of each buffer.  On return both
 * cursors have been advanced past everything consumed / produced.
 *
 * Returns:
 *   conversionOK     - all input was consumed.
 *   targetExhausted  - the output buffer filled up; the source cursor is left
 *                      on the code point that did not fit.
 *   sourceIllegal    - flags == strictConversion and the input holds a
 *                      surrogate value or a value above UNI_MAX_LEGAL_UTF32;
 *                      the source cursor is left on the offending value.
 *
 * When flags is not strictConversion, such illegal values are replaced by
 * UNI_REPLACEMENT_CHAR and conversion continues.
 */
ConversionResult ConvertUTF32toUTF16 (const t_UTF32** sourceStart, const t_UTF32* sourceEnd, t_UTF16** targetStart, t_UTF16* targetEnd, ConversionFlags flags)
{
    ConversionResult result = conversionOK;
    const t_UTF32* source = *sourceStart;
    t_UTF16* target = *targetStart;

    while (source < sourceEnd) {
        t_UTF32 ch;

        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }
        ch = *source++;
        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = (t_UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /*
                 * Stop on the offending value, exactly like the surrogate
                 * case above.  Carrying on would silently drop the value and
                 * let a later targetExhausted overwrite the error.
                 */
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            } else {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        } else {
            /* target is a character in range 0xFFFF - 0x10FFFF. */
            if (target + 1 >= targetEnd) {
                --source; /* Back up source pointer! */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (t_UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (t_UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

103

104

/* --------------------------------------------------------------------- */

105

106

/*
 * Convert a run of UTF-16 code units to UTF-32.
 *
 * sourceStart/targetStart point at the caller's read and write cursors;
 * sourceEnd/targetEnd are one past the end of each buffer.  On return both
 * cursors have been advanced past everything consumed / produced.
 *
 * Returns:
 *   conversionOK     - all input was consumed.
 *   sourceExhausted  - the input ends with a high surrogate whose partner is
 *                      not in the buffer; the source cursor is left on it.
 *   sourceIllegal    - flags == strictConversion and an unpaired surrogate
 *                      was found; the source cursor is left on it.
 *   targetExhausted  - the output buffer filled up; the source cursor is
 *                      rewound to the start of the character that did not fit.
 *
 * When flags is not strictConversion, unpaired surrogates are copied to the
 * output unchanged.
 */
ConversionResult ConvertUTF16toUTF32 (const t_UTF16** sourceStart, const t_UTF16* sourceEnd, t_UTF32** targetStart, t_UTF32* targetEnd, ConversionFlags flags)
{
    ConversionResult result = conversionOK;
    const t_UTF16* source = *sourceStart;
    t_UTF32* target = *targetStart;
    /* Zero-initialised so the CVTUTF_DEBUG trace below never reads an unset value. */
    t_UTF32 ch = 0, ch2 = 0;

    while (source < sourceEnd) {
        const t_UTF16* oldSource = source; /* In case we have to back up because of target overflow. */

        ch = *source++;
        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (source < sourceEnd) {
                ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        if (target >= targetEnd) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted;
            break;
        }
        *target++ = ch;
    }
    *sourceStart = source;
    *targetStart = target;
#ifdef CVTUTF_DEBUG
    if (result == sourceIllegal)
    {
        /* %x takes unsigned int; cast because the width of t_UTF32 is not visible here. */
        fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", (unsigned int)ch, (unsigned int)ch2);
        fflush(stderr);
    }
#endif
    return result;
}

160

161

/* --------------------------------------------------------------------- */

162

163

164

/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table
 * is left as-is for anyone who may want to do such conversion, which was
 * allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[256] =
{
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

181

182

183

* Magic values subtracted from a buffer value during UTF8 conversion.

184

* This table contains as many values as there might be trailing bytes

185

* in a UTF-8 sequence.

186

187

static const t_UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,

188

0x03C82080UL, 0xFA082080UL, 0x82082080UL };

189

190

191

* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed

192

* into the first byte, depending on how many bytes follow. There are

193

* as many entries in this table as there are UTF-8 sequence types.

194

* (I.e., one byte sequence, two byte... etc.). Remember that sequencs

195

* for *legal* UTF-8 will be 4 or fewer bytes total.

196

197

static const t_UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

198

199

/* --------------------------------------------------------------------- */

200

201

/* The interface converts a whole buffer to avoid function-call overhead.
 * Constants have been gathered. Loops & conditionals have been removed as
 * much as possible for efficiency, in favor of drop-through switches.
 * (See "Note A" at the bottom of the file for equivalent code.)
 * If your compiler supports it, the "isLegalUTF8" call can be turned
 * into an inline function.
 */

207

208

209

/* --------------------------------------------------------------------- */

210

211

/*
 * Convert a run of UTF-16 code units to UTF-8.
 *
 * sourceStart/targetStart point at the caller's read and write cursors;
 * sourceEnd/targetEnd are one past the end of each buffer.  On return both
 * cursors have been advanced past everything consumed / produced.
 *
 * Returns:
 *   conversionOK     - all input was consumed.
 *   sourceExhausted  - the input ends with a high surrogate whose partner is
 *                      not in the buffer; the source cursor is left on it.
 *   sourceIllegal    - flags == strictConversion and an unpaired surrogate
 *                      was found; the source cursor is left on it.
 *   targetExhausted  - the encoded character does not fit; the source cursor
 *                      is rewound to the start of that character and nothing
 *                      is written for it.
 *
 * When flags is not strictConversion, an unpaired surrogate is encoded
 * as-is (as a three-byte sequence).
 */
ConversionResult ConvertUTF16toUTF8 (const t_UTF16** sourceStart, const t_UTF16* sourceEnd, t_UTF8** targetStart, t_UTF8* targetEnd, ConversionFlags flags)
{
    ConversionResult result = conversionOK;
    const t_UTF16* source = *sourceStart;
    t_UTF8* target = *targetStart;

    while (source < sourceEnd) {
        t_UTF32 ch;
        unsigned short bytesToWrite = 0;
        const t_UTF32 byteMask = 0xBF;
        const t_UTF32 byteMark = 0x80;
        const t_UTF16* oldSource = source; /* In case we have to back up because of target overflow. */

        ch = *source++;
        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (source < sourceEnd) {
                t_UTF32 ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }

        /* Figure out how many bytes the result will require */
        if (ch < (t_UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (t_UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (t_UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch < (t_UTF32)0x110000) {
            bytesToWrite = 4;
        } else {
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
        }

        /* Jump to the end of the sequence, then fill the bytes in back to front. */
        target += bytesToWrite;
        if (target > targetEnd) {
            source = oldSource; /* Back up source pointer! */
            target -= bytesToWrite;
            result = targetExhausted;
            break;
        }
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 3: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 2: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 1: *--target = (t_UTF8)(ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

277

278

/* --------------------------------------------------------------------- */

279

280

281

* Utility routine to tell whether a sequence of bytes is legal UTF-8.

282

* This must be called with the length pre-determined by the first byte.

283

* If not calling this from ConvertUTF8to*, then the length can be set by:

284

* length = trailingBytesForUTF8[*source]+1;

285

* and the sequence is illegal right away if there aren't that many bytes

286

* available.

287

* If presented with a length > 4, this returns false. The Unicode

288

* definition of UTF-8 goes up to 4-byte sequences.

289

290

291

static bool isLegalUTF8(const t_UTF8 *source, int length)

292

{

293

t_UTF8 a;

294

const t_UTF8 *srcptr = source+length;

295

switch (length) {

296

default: return false;

297

/* Everything else falls through when "true"... */

298

case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

299

case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

300

case 2: if ((a = (*--srcptr)) > 0xBF) return false;

301

302

switch (*source) {

303

/* no fall-through in this inner switch */

304

case 0xE0: if (a < 0xA0) return false; break;

305

case 0xED: if ((a < 0x80) || (a > 0x9F)) return false; break;

306

case 0xF0: if (a < 0x90) return false; break;

307

case 0xF4: if (a > 0x8F) return false; break;

308

default: if (a < 0x80) return false;

309

}

310

311

case 1: if (*source >= 0x80 && *source < 0xC2) return false;

312

}

313

if (*source > 0xF4) return false;

314

return true;

315

}

316

317

/* --------------------------------------------------------------------- */

318

319

320

* Exported function to return whether a UTF-8 sequence is legal or not.

321

* This is not used here; it's just exported.

322

323

324

bool isLegalUTF8Sequence(const t_UTF8 *source, const t_UTF8 *sourceEnd)

325

{

326

int length;

327

if (source == sourceEnd) {

328

return true;

329

}

330

while (true) {

331

length = trailingBytesForUTF8[*source]+1;

332

if (source+length > sourceEnd) {

333

return false;

334

}

335

if (!isLegalUTF8(source, length)) {

336

return false;

337

}

338

source += length;

339

if (source >= sourceEnd) {

340

return true;

341

}

342

}

343

}

344

345

/**

346

* This is a variation of isLegalUTF8Sequence() that behaves like g_utf8_validate().

347

* In addition to knowing if the sequence is legal, it also tells you the last good character.

348

349

bool

350

tr_utf8_validate( const char * str, int max_len, const char ** end )

351

{

352

const t_UTF8* source = (const t_UTF8*) str;

353

const t_UTF8* sourceEnd;

354

355

if( max_len == 0 )

356

return true;

357

358

if( str == NULL )

359

return false;

360

361

sourceEnd = source + ((max_len < 0) ? strlen(str) : (size_t)max_len);

362

363

if( source == sourceEnd )

364

{

365

if( end != NULL )

366

*end = (const char*) source;

367

return true;

368

}

369

370

for( ;; )

371

{

372

const int length = trailingBytesForUTF8[*source] + 1;

373

if (source + length > sourceEnd) {

374

if( end != NULL )

375

*end = (const char*) source;

376

return false;

377

}

378

if (!isLegalUTF8(source, length)) {

379

if( end != NULL )

380

*end = (const char*) source;

381

return false;

382

}

383

source += length;

384

if (source >= sourceEnd) {

385

if( end != NULL )

386

*end = (const char*) source;

387

return true;

388

}

389

}

390

391

392

}

393

394

395

/* --------------------------------------------------------------------- */

396

397

/*
 * Convert a run of UTF-8 bytes to UTF-16.
 *
 * sourceStart/targetStart point at the caller's read and write cursors;
 * sourceEnd/targetEnd are one past the end of each buffer.  On return both
 * cursors have been advanced past everything consumed / produced.
 *
 * Returns:
 *   conversionOK     - all input was consumed.
 *   sourceExhausted  - the last sequence is cut off by sourceEnd; the source
 *                      cursor is left on its lead byte.
 *   sourceIllegal    - a sequence failed isLegalUTF8() (checked regardless of
 *                      flags), or flags == strictConversion and the decoded
 *                      value is a surrogate or above UNI_MAX_UTF16; the
 *                      source cursor is left on the sequence's lead byte.
 *   targetExhausted  - the output buffer filled up; the source cursor is
 *                      rewound to the lead byte of the character that did
 *                      not fit.
 *
 * When flags is not strictConversion, decoded surrogates and out-of-range
 * values are replaced by UNI_REPLACEMENT_CHAR.
 */
ConversionResult ConvertUTF8toUTF16 (const t_UTF8** sourceStart, const t_UTF8* sourceEnd, t_UTF16** targetStart, t_UTF16* targetEnd, ConversionFlags flags)
{
    ConversionResult result = conversionOK;
    const t_UTF8* source = *sourceStart;
    t_UTF16* target = *targetStart;

    while (source < sourceEnd) {
        t_UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        if (source + extraBytesToRead >= sourceEnd) {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if (! isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }
        /*
         * The cases all fall through. See "Note A" below.
         */
        switch (extraBytesToRead) {
            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ /* fall through */
            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ /* fall through */
            case 3: ch += *source++; ch <<= 6; /* fall through */
            case 2: ch += *source++; ch <<= 6; /* fall through */
            case 1: ch += *source++; ch <<= 6; /* fall through */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead+1); /* Back up source pointer! */
            result = targetExhausted;
            break;
        }
        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = (t_UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                result = sourceIllegal;
                source -= (extraBytesToRead+1); /* return to the start */
                break; /* Bail out; shouldn't continue */
            } else {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        } else {
            /* target is a character in range 0xFFFF - 0x10FFFF. */
            if (target + 1 >= targetEnd) {
                source -= (extraBytesToRead+1); /* Back up source pointer! */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (t_UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (t_UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

466

467

/* --------------------------------------------------------------------- */

468

469

/*
 * Convert a run of UTF-32 code points to UTF-8.
 *
 * sourceStart/targetStart point at the caller's read and write cursors;
 * sourceEnd/targetEnd are one past the end of each buffer.  On return both
 * cursors have been advanced past everything consumed / produced.
 *
 * Returns:
 *   conversionOK     - all input was consumed without problems.
 *   sourceIllegal    - either flags == strictConversion and a surrogate value
 *                      was found (conversion stops, source cursor left on
 *                      it), or a value above UNI_MAX_LEGAL_UTF32 was seen; in
 *                      the latter case UNI_REPLACEMENT_CHAR is written and
 *                      conversion continues, whatever the flags.
 *   targetExhausted  - the encoded character does not fit; the source cursor
 *                      is left on that character and nothing is written for
 *                      it.  This overrides an earlier sourceIllegal.
 *
 * When flags is not strictConversion, surrogate values are encoded as-is
 * (as three-byte sequences).
 */
ConversionResult ConvertUTF32toUTF8 (
    const t_UTF32** sourceStart, const t_UTF32* sourceEnd,
    t_UTF8** targetStart, t_UTF8* targetEnd, ConversionFlags flags)
{
    ConversionResult result = conversionOK;
    const t_UTF32* source = *sourceStart;
    t_UTF8* target = *targetStart;

    while (source < sourceEnd) {
        t_UTF32 ch;
        unsigned short bytesToWrite = 0;
        const t_UTF32 byteMask = 0xBF;
        const t_UTF32 byteMark = 0x80;

        ch = *source++;
        if (flags == strictConversion ) {
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        /*
         * Figure out how many bytes the result will require. Turn any
         * illegally large UTF32 things (> Plane 17) into replacement chars.
         */
        if (ch < (t_UTF32)0x80) {
            bytesToWrite = 1;
        } else if (ch < (t_UTF32)0x800) {
            bytesToWrite = 2;
        } else if (ch < (t_UTF32)0x10000) {
            bytesToWrite = 3;
        } else if (ch <= UNI_MAX_LEGAL_UTF32) {
            bytesToWrite = 4;
        } else {
            bytesToWrite = 3;
            ch = UNI_REPLACEMENT_CHAR;
            result = sourceIllegal;
        }

        /* Jump to the end of the sequence, then fill the bytes in back to front. */
        target += bytesToWrite;
        if (target > targetEnd) {
            --source; /* Back up source pointer! */
            target -= bytesToWrite;
            result = targetExhausted;
            break;
        }
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 3: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 2: *--target = (t_UTF8)((ch | byteMark) & byteMask); ch >>= 6; /* fall through */
            case 1: *--target = (t_UTF8) (ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

520

521

/* --------------------------------------------------------------------- */

522

523

/*
 * Convert a run of UTF-8 bytes to UTF-32.
 *
 * sourceStart/targetStart point at the caller's read and write cursors;
 * sourceEnd/targetEnd are one past the end of each buffer.  On return both
 * cursors have been advanced past everything consumed / produced.
 *
 * Returns:
 *   conversionOK     - all input was consumed without problems.
 *   sourceExhausted  - the last sequence is cut off by sourceEnd; the source
 *                      cursor is left on its lead byte.
 *   sourceIllegal    - a sequence failed isLegalUTF8() (checked regardless of
 *                      flags; conversion stops), or flags == strictConversion
 *                      and the decoded value is a surrogate (conversion
 *                      stops, source cursor on the lead byte), or the decoded
 *                      value is above UNI_MAX_LEGAL_UTF32 (UNI_REPLACEMENT_CHAR
 *                      is written and conversion continues).
 *   targetExhausted  - the output buffer filled up; the source cursor is
 *                      rewound to the lead byte of the character that did
 *                      not fit.
 *
 * When flags is not strictConversion, decoded surrogates are replaced by
 * UNI_REPLACEMENT_CHAR.
 */
ConversionResult ConvertUTF8toUTF32 (
    const t_UTF8** sourceStart, const t_UTF8* sourceEnd,
    t_UTF32** targetStart, t_UTF32* targetEnd, ConversionFlags flags)
{
    ConversionResult result = conversionOK;
    const t_UTF8* source = *sourceStart;
    t_UTF32* target = *targetStart;

    while (source < sourceEnd) {
        t_UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];

        if (source + extraBytesToRead >= sourceEnd) {
            result = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if (! isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            break;
        }
        /*
         * The cases all fall through. See "Note A" below.
         */
        switch (extraBytesToRead) {
            case 5: ch += *source++; ch <<= 6; /* fall through */
            case 4: ch += *source++; ch <<= 6; /* fall through */
            case 3: ch += *source++; ch <<= 6; /* fall through */
            case 2: ch += *source++; ch <<= 6; /* fall through */
            case 1: ch += *source++; ch <<= 6; /* fall through */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (target >= targetEnd) {
            source -= (extraBytesToRead+1); /* Back up the source pointer! */
            result = targetExhausted;
            break;
        }
        if (ch <= UNI_MAX_LEGAL_UTF32) {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = ch;
            }
        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}

583

584

/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */

602

603

NAMESPACE_END

Older »