1
/*-------------------------------------------------------------------------
3
* Utility functions for conversion procs.
5
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
6
* Portions Copyright (c) 1994, Regents of the University of California
11
*-------------------------------------------------------------------------
14
#include "mb/pg_wchar.h"
18
* LATINn ---> MIC when the charset's local codes map directly to MIC
20
* l points to the source string of length len
21
* p is the output area (must be large enough!)
22
* lc is the mule character set id for the local encoding
23
* encoding is the PG identifier for the local encoding
26
latin2mic(const unsigned char *l, unsigned char *p, int len,
35
report_invalid_encoding(encoding, (const char *) l, len);
36
if (IS_HIGHBIT_SET(c1))
46
* MIC ---> LATINn when the charset's local codes map directly to MIC
48
* mic points to the source string of length len
49
* p is the output area (must be large enough!)
50
* lc is the mule character set id for the local encoding
51
* encoding is the PG identifier for the local encoding
54
mic2latin(const unsigned char *mic, unsigned char *p, int len,
63
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
64
if (!IS_HIGHBIT_SET(c1))
73
int l = pg_mic_mblen(mic);
76
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
78
if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
79
report_untranslatable_char(PG_MULE_INTERNAL, encoding,
80
(const char *) mic, len);
93
* While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
94
* characters, here we must take a hard line because we don't know
95
* the appropriate MIC equivalent.
98
pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
105
if (c1 == 0 || IS_HIGHBIT_SET(c1))
106
report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
118
pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
125
if (c1 == 0 || IS_HIGHBIT_SET(c1))
126
report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
127
(const char *) mic, len);
136
* latin2mic_with_table: a generic single byte charset encoding
137
* conversion from a local charset to the mule internal code.
139
* l points to the source string of length len
140
* p is the output area (must be large enough!)
141
* lc is the mule character set id for the local encoding
142
* encoding is the PG identifier for the local encoding
143
* tab holds conversion entries for the local charset
144
* starting from 128 (0x80). each entry in the table
145
* holds the corresponding code point for the mule internal code.
148
latin2mic_with_table(const unsigned char *l,
153
const unsigned char *tab)
162
report_invalid_encoding(encoding, (const char *) l, len);
163
if (!IS_HIGHBIT_SET(c1))
167
c2 = tab[c1 - HIGHBIT];
174
report_untranslatable_char(encoding, PG_MULE_INTERNAL,
175
(const char *) l, len);
184
* mic2latin_with_table: a generic single byte charset encoding
185
* conversion from the mule internal code to a local charset.
187
* mic points to the source string of length len
188
* p is the output area (must be large enough!)
189
* lc is the mule character set id for the local encoding
190
* encoding is the PG identifier for the local encoding
191
* tab holds conversion entries for the mule internal code's
192
* second byte, starting from 128 (0x80). each entry in the table
193
* holds the corresponding code point for the local charset.
196
mic2latin_with_table(const unsigned char *mic,
201
const unsigned char *tab)
210
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
211
if (!IS_HIGHBIT_SET(c1))
220
int l = pg_mic_mblen(mic);
223
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
225
if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
226
(c2 = tab[mic[1] - HIGHBIT]) == 0)
228
report_untranslatable_char(PG_MULE_INTERNAL, encoding,
229
(const char *) mic, len);
230
break; /* keep compiler quiet */
241
* comparison routine for bsearch()
242
* this routine is intended for UTF8 -> local code
245
compare1(const void *p1, const void *p2)
251
v2 = ((pg_utf_to_local *) p2)->utf;
252
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
256
* comparison routine for bsearch()
257
* this routine is intended for local code -> UTF8
260
compare2(const void *p1, const void *p2)
266
v2 = ((pg_local_to_utf *) p2)->code;
267
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
271
* comparison routine for bsearch()
272
* this routine is intended for combined UTF8 -> local code
275
compare3(const void *p1, const void *p2)
283
s2 = *((uint32 *) p1 + 1);
284
d1 = ((pg_utf_to_local_combined *) p2)->utf1;
285
d2 = ((pg_utf_to_local_combined *) p2)->utf2;
286
return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
290
* comparison routine for bsearch()
291
* this routine is intended for local code -> combined UTF8
294
compare4(const void *p1, const void *p2)
300
v2 = ((pg_local_to_utf_combined *) p2)->code;
301
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
305
* convert 32bit wide character to multibyte stream pointed to by iso
307
static unsigned char *
308
set_iso_code(unsigned char *iso, uint32 code)
310
if (code & 0xff000000)
312
if (code & 0x00ff0000)
313
*iso++ = (code & 0x00ff0000) >> 16;
314
if (code & 0x0000ff00)
315
*iso++ = (code & 0x0000ff00) >> 8;
316
if (code & 0x000000ff)
317
*iso++ = code & 0x000000ff;
322
* UTF8 ---> local code
324
* utf: input UTF8 string (need not be null-terminated).
325
* iso: pointer to the output area (must be large enough!)
326
* map: the conversion map.
327
* cmap: the conversion map for combined characters.
329
* size1: the size of the conversion map.
330
* size2: the size of the conversion map for combined characters
332
* encoding: the PG identifier for the local encoding.
333
* len: length of input string.
336
UtfToLocal(const unsigned char *utf, unsigned char *iso,
337
const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
338
int size1, int size2, int encoding, int len)
344
pg_utf_to_local_combined *cp;
347
for (; len > 0; len -= l)
349
/* "break" cases all represent errors */
353
l = pg_utf_mblen(utf);
358
if (!pg_utf8_islegal(utf, l))
363
/* ASCII case is easy */
381
iutf |= *utf++ << 16;
387
* first, try with combined map if possible
391
const unsigned char *utf_save = utf;
397
l = pg_utf_mblen(utf);
401
if (!pg_utf8_islegal(utf, l))
410
p = bsearch(&cutf[0], map, size1,
411
sizeof(pg_utf_to_local), compare1);
413
report_untranslatable_char(PG_UTF8, encoding,
414
(const char *) (utf_save - l_save), len_save);
415
iso = set_iso_code(iso, p->code);
418
/* ASCII case is easy */
436
iutf |= *utf++ << 16;
442
cp = bsearch(cutf, cmap, size2,
443
sizeof(pg_utf_to_local_combined), compare3);
448
/* not found in combined map. try with ordinary map */
449
p = bsearch(&cutf[0], map, size1,
450
sizeof(pg_utf_to_local), compare1);
452
report_untranslatable_char(PG_UTF8, encoding,
453
(const char *) (utf_save - l_save), len_save);
454
iso = set_iso_code(iso, p->code);
456
p = bsearch(&cutf[1], map, size1,
457
sizeof(pg_utf_to_local), compare1);
459
report_untranslatable_char(PG_UTF8, encoding,
460
(const char *) (utf - l), len);
464
else /* no cmap or no remaining data */
466
p = bsearch(&iutf, map, size1,
467
sizeof(pg_utf_to_local), compare1);
469
report_untranslatable_char(PG_UTF8, encoding,
470
(const char *) (utf - l), len);
473
iso = set_iso_code(iso, code);
477
report_invalid_encoding(PG_UTF8, (const char *) utf, len);
483
* local code ---> UTF8
485
* iso: input local string (need not be null-terminated).
486
* utf: pointer to the output area (must be large enough!)
487
* map: the conversion map.
488
* cmap: the conversion map for combined characters.
490
* size1: the size of the conversion map.
491
* size2: the size of the conversion map for combined characters
493
* encoding: the PG identifier for the local encoding.
494
* len: length of input string.
497
LocalToUtf(const unsigned char *iso, unsigned char *utf,
498
const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
499
int size1, int size2, int encoding, int len)
504
pg_local_to_utf_combined *cp;
506
if (!PG_VALID_ENCODING(encoding))
508
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
509
errmsg("invalid encoding number: %d", encoding)));
511
for (; len > 0; len -= l)
513
/* "break" cases all represent errors */
517
if (!IS_HIGHBIT_SET(*iso))
519
/* ASCII case is easy */
525
l = pg_encoding_verifymb(encoding, (const char *) iso, len);
545
iiso |= *iso++ << 16;
550
p = bsearch(&iiso, map, size1,
551
sizeof(pg_local_to_utf), compare2);
556
* not found in the ordinary map. if there's a combined character
561
cp = bsearch(&iiso, cmap, size2,
562
sizeof(pg_local_to_utf_combined), compare4);
566
if (cp->utf1 & 0xff000000)
567
*utf++ = cp->utf1 >> 24;
568
if (cp->utf1 & 0x00ff0000)
569
*utf++ = (cp->utf1 & 0x00ff0000) >> 16;
570
if (cp->utf1 & 0x0000ff00)
571
*utf++ = (cp->utf1 & 0x0000ff00) >> 8;
572
if (cp->utf1 & 0x000000ff)
573
*utf++ = cp->utf1 & 0x000000ff;
575
if (cp->utf2 & 0xff000000)
576
*utf++ = cp->utf2 >> 24;
577
if (cp->utf2 & 0x00ff0000)
578
*utf++ = (cp->utf2 & 0x00ff0000) >> 16;
579
if (cp->utf2 & 0x0000ff00)
580
*utf++ = (cp->utf2 & 0x0000ff00) >> 8;
581
if (cp->utf2 & 0x000000ff)
582
*utf++ = cp->utf2 & 0x000000ff;
588
report_untranslatable_char(encoding, PG_UTF8,
589
(const char *) (iso - l), len);
594
if (p->utf & 0xff000000)
595
*utf++ = p->utf >> 24;
596
if (p->utf & 0x00ff0000)
597
*utf++ = (p->utf & 0x00ff0000) >> 16;
598
if (p->utf & 0x0000ff00)
599
*utf++ = (p->utf & 0x0000ff00) >> 8;
600
if (p->utf & 0x000000ff)
601
*utf++ = p->utf & 0x000000ff;
606
report_invalid_encoding(encoding, (const char *) iso, len);