1
// ---------------------------------------------------------------------------
3
// - standard object library - unicode functions class implementation -
4
// ---------------------------------------------------------------------------
5
// - This program is free software; you can redistribute it and/or modify -
6
// - it provided that this copyright notice is kept intact. -
8
// - This program is distributed in the hope that it will be useful, but -
9
// - without any warranty; without even the implied warranty of -
10
// - merchantability or fitness for a particular purpose. In no event shall -
11
// - the copyright holder be liable for any direct, indirect, incidental or -
12
// - special damages arising in any way out of the use of this software. -
13
// ---------------------------------------------------------------------------
14
// - copyright (c) 1999-2011 amaury darsch -
15
// ---------------------------------------------------------------------------
18
#include "Unicode.hpp"
19
#include "Utility.hpp"
20
#include "Exception.hpp"
25
// -------------------------------------------------------------------------
26
// - private section -
27
// -------------------------------------------------------------------------
29
// this procedure checks if a character belong to an array
30
static inline bool is_csep (const char c, const char* sep) {
31
while (*sep != nilc) {
32
if (*sep++ == c) return true;
37
// this procedure checks if a character belong to an array
38
static inline bool is_qsep (const t_quad c, const t_quad* sep) {
39
while (*sep != nilq) {
40
if (*sep++ == c) return true;
45
// this procedure checks if a buffer is a valid utf8 encoding
46
static bool is_utf_08 (const t_byte src[], const long size) {
47
// check trivial size first
48
if ((size <= 0) || (size >= Unicode::MAX_UTF8_SIZE)) return false;
50
if ((src[0] < 0x80) && (size == 1)) return true;
52
if ((src[0] < 0xE0) && (size == 2)) {
53
if ((src[1] & 0x80) != 0x80) return false;
57
if ((src[0] < 0xF0) && (size == 3)) {
58
if ((src[1] & 0x80) != 0x80) return false;
59
if ((src[2] & 0x80) != 0x80) return false;
63
if ((src[0] < 0xF8) && (size == 4)) {
64
if ((src[1] & 0x80) != 0x80) return false;
65
if ((src[2] & 0x80) != 0x80) return false;
66
if ((src[3] & 0x80) != 0x80) return false;
70
if ((src[0] < 0xFC) && (size == 5)) {
71
if ((src[1] & 0x80) != 0x80) return false;
72
if ((src[2] & 0x80) != 0x80) return false;
73
if ((src[3] & 0x80) != 0x80) return false;
74
if ((src[4] & 0x80) != 0x80) return false;
78
if ((src[0] < 0xFE) && (size == 6)) {
79
if ((src[1] & 0x80) != 0x80) return false;
80
if ((src[2] & 0x80) != 0x80) return false;
81
if ((src[3] & 0x80) != 0x80) return false;
82
if ((src[4] & 0x80) != 0x80) return false;
83
if ((src[5] & 0x80) != 0x80) return false;
90
// this procedure converts a utf8 buffer into a unicode quad
91
static bool utf_08_toq (t_quad& dst, const t_byte src[]) {
92
// initialize destination
103
dst = ((t_quad) (b0 & 0x3F)) << 6;
105
if ((b1 & 0x80) != 0x80) {
109
dst |= (t_quad) (b1 & 0x3F);
110
if (dst < 0x00000080UL) {
118
dst = ((t_quad) (b0 & 0x0f)) << 12;
119
t_byte b1 = (t_byte) src[1];
120
if ((b1 & 0x80) != 0x80) {
124
dst |= ((t_quad) (b1 & 0x3F)) << 6;
125
t_byte b2 = (t_byte) src[2];
126
if ((b2 & 0x80) != 0x80) {
130
dst |= (t_quad) (b2 & 0x3F);
131
if (dst < 0x00000800UL) {
139
dst = ((t_quad) (b0 & 0x07)) << 18;
140
t_byte b1 = (t_byte) src[1];
141
if ((b1 & 0x80) != 0x80) {
145
dst |= ((t_quad) (b1 & 0x3F)) << 12;
146
t_byte b2 = (t_byte) src[2];
147
if ((b2 & 0x80) != 0x80) {
151
dst |= (t_quad) (b2 & 0x3F) << 6;
152
t_byte b3 = (t_byte) src[3];
153
if ((b3 & 0x80) != 0x80) {
157
dst |= (t_quad) (b3 & 0x3F);
158
if (dst < 0x00010000UL) {
166
dst = ((t_quad) (b0 & 0x03)) << 24;
167
t_byte b1 = (t_byte) src[1];
168
if ((b1 & 0x80) != 0x80) {
172
dst |= ((t_quad) (b1 & 0x3F)) << 18;
173
t_byte b2 = (t_byte) src[2];
174
if ((b2 & 0x80) != 0x80) {
178
dst |= (t_quad) (b2 & 0x3F) << 12;
179
t_byte b3 = (t_byte) src[3];
180
if ((b3 & 0x80) != 0x80) {
184
dst |= (t_quad) (b3 & 0x3F) << 6;
185
t_byte b4 = (t_byte) src[4];
186
if ((b4 & 0x80) != 0x80) {
190
dst |= (t_quad) (b4 & 0x3F);
191
if (dst < 0x00200000UL) {
199
dst = ((t_quad) (b0 & 0x01)) << 30;
200
t_byte b1 = (t_byte) src[1];
201
if ((b1 & 0x80) != 0x80) {
205
dst |= ((t_quad) (b1 & 0x3F)) << 24;
206
t_byte b2 = (t_byte) src[2];
207
if ((b2 & 0x80) != 0x80) {
211
dst |= (t_quad) (b2 & 0x3F) << 18;
212
t_byte b3 = (t_byte) src[3];
213
if ((b3 & 0x80) != 0x80) {
217
dst |= (t_quad) (b3 & 0x3F) << 12;
218
t_byte b4 = (t_byte) src[4];
219
if ((b4 & 0x80) != 0x80) {
223
dst |= (t_quad) (b4 & 0x3F) << 6;
224
t_byte b5 = (t_byte) src[5];
225
if ((b5 & 0x80) != 0x80) {
229
dst |= (t_quad) (b5 & 0x3F);
230
if (dst < 0x04000000UL) {
239
// this procedure converts a quad into a utf8 buffer
240
static long qto_utf_08 (t_byte* dst, const t_quad c) {
242
if (dst == nilp) return 0;
246
if (c < 0x00000080UL) {
247
dst[i++] = (t_byte) c;
248
} else if (c < 0x00000800UL) {
249
dst[i++] = (t_byte) (0x000000C0UL | ((c >> 6) & 0x0000001FUL));
250
dst[i++] = (t_byte) (0x00000080UL | (c & 0x0000003FUL));
251
} else if (c < 0x00010000UL) {
252
dst[i++] = (t_byte) (0x000000E0UL | ((c >> 12) & 0x0000000FUL));
253
dst[i++] = (t_byte) (0x00000080UL | ((c >> 6) & 0x0000003FUL));
254
dst[i++] = (t_byte) (0x00000080UL | (c & 0x0000003FUL));
255
} else if (c < 0x00200000UL) {
256
dst[i++] = (t_byte) (0x000000F0UL | ((c >> 18) & 0x00000007UL));
257
dst[i++] = (t_byte) (0x00000080UL | ((c >> 12) & 0x0000003FUL));
258
dst[i++] = (t_byte) (0x00000080UL | ((c >> 6) & 0x0000003FUL));
259
dst[i++] = (t_byte) (0x00000080UL | (c & 0x0000003FUL));
260
} else if (c < 0x04000000UL) {
261
dst[i++] = (t_byte) (0x000000F8UL | ((c >> 24) & 0x00000003UL));
262
dst[i++] = (t_byte) (0x00000080UL | ((c >> 18) & 0x0000003FUL));
263
dst[i++] = (t_byte) (0x00000080UL | ((c >> 12) & 0x0000003FUL));
264
dst[i++] = (t_byte) (0x00000080UL | ((c >> 6) & 0x0000003FUL));
265
dst[i++] = (t_byte) (0x00000080UL | (c & 0x0000003FUL));
266
} else if (c < 0x80000000UL) {
267
dst[i++] = (t_byte) (0x000000FCUL | ((c >> 30) & 0x00000001UL));
268
dst[i++] = (t_byte) (0x00000080UL | ((c >> 24) & 0x0000003FUL));
269
dst[i++] = (t_byte) (0x00000080UL | ((c >> 18) & 0x0000003FUL));
270
dst[i++] = (t_byte) (0x00000080UL | ((c >> 12) & 0x0000003FUL));
271
dst[i++] = (t_byte) (0x00000080UL | ((c >> 6) & 0x0000003FUL));
272
dst[i++] = (t_byte) (0x00000080UL | (c & 0x0000003FUL));
279
// this procedure converts a char array to a quad array in byte mode
280
static t_quad* ctoq_byte (const char* s, const long size) {
281
// check for null size
282
if (size <= 0) return nilp;
283
// allocate the result and map
284
t_quad* result = new t_quad[size+1];
285
for (long i = 0; i < size; i++) result[i] = Unicode::toquad (s[i]);
286
// fix end of string and return
291
// this procedure converts a char array to a quad array in utf8 mode
292
static t_quad* ctoq_utf8 (const char* s, const long size) {
293
// check for null size
294
if (size <= 0) return nilp;
295
// prepare the buffer and index
298
t_byte buf[Unicode::MAX_UTF8_SIZE];
299
// allocate the result and map
300
t_quad* result = new t_quad[size+1];
301
for (long i = 0; i < size; i++) {
305
if (is_utf_08 (buf, pos) == false) {
306
// check for overflow
307
if (pos >= Unicode::MAX_UTF8_SIZE) {
308
throw Exception ("decode-error", "cannot decode utf8 buffer");
313
// here the buffer is valid - so convert it
315
if (utf_08_toq (c, buf) == false) {
316
throw Exception ("decode-error", "cannot decode utf8 buffer");
318
// save the quad and continue
322
// fix end of string and return
327
// this procedure converts a quad array to a char array in byte mode
328
static char* qtoc_byte (const t_quad* s, const long size) {
329
// check for null size
330
if (size <= 0) return nilp;
331
// allocate the result and map
332
char* result = new char[size+1];
333
for (long i = 0; i < size; i++) result[i] = Unicode::tochar (s[i]);
334
// fix end of string and return
339
// this procedure converts a quad array to a char array in utf8 mode
340
static char* qtoc_utf8 (const t_quad* s, const long size) {
341
// check for null size
342
if (size <= 0) return nilp;
343
long len = size * Unicode::MAX_UTF8_SIZE + 1;
345
// allocate the result and map
346
char* result = new char[len];
347
for (long i = 0; i < size; i++) {
348
// set conversion buffer
349
t_byte buf[Unicode::MAX_UTF8_SIZE];
350
long bsz = qto_utf_08 (buf, s[i]);
351
// check for proper result
353
throw Exception ("encode-error",
354
"invalid character to encode in utf8 mode");
357
for (long j = 0; j < bsz; j++) result[idx++] = buf[j];
359
// fix end of string and return
364
// -------------------------------------------------------------------------
365
// - public section -
366
// -------------------------------------------------------------------------
368
// convert a unicode character to a native character if possible
370
char Unicode::tochar (const t_quad value) {
371
// check for 8 bit range
372
if ((value & 0xFFFFFF00UL) != nilq) {
373
throw Exception ("unicode-error", "cannot convert unicode character");
376
char result = (char) (value & 0x000000FFUL);
380
// convert a unicode character to a bmp character if possible
382
t_word Unicode::tobmp (const t_quad value) {
383
// check for 16 bit range
384
if ((value & 0xFFFF0000UL) != nilq) {
385
throw Exception ("unicode-error", "cannot convert unicode character");
388
t_word result = (t_word) (value & 0x0000FFFFUL);
392
// convert a hexadecimal character to a byte
394
t_byte Unicode::htob (const t_quad value) {
395
char c = Unicode::tochar (value);
396
return Ascii::htob (c);
399
// convert a string into a byte array
401
t_byte* Unicode::stob (long& size, const String& s) {
402
// update the output size
403
long slen = s.length ();
405
if (size == 0) return nilp;
406
// check the string length
407
if ((slen % 2) != 0) {
408
throw Exception ("unicode-error",
409
"cannot convert string to byte array", s);
411
// preset start index
413
// check first two bytes
414
if ((s[0] == '0') && (s[1] == 'x')) {
418
throw Exception ("unicode-error",
419
"cannot convert string to byte array", s);
422
// allocate result array
423
t_byte* result = new t_byte[size];
424
for (long i = 0, j = si; i < size; i++) {
425
// get upper and lower quad
428
// get upper and lower byte
429
t_byte ub = Unicode::htob (uq);
430
t_byte lb = Unicode::htob (lq);
431
// set the byte value
432
result[i] = (ub << 4) | (lb & 0x0F);
438
// convert a native character to a unicode character
440
t_quad Unicode::toquad (const char value) {
441
t_quad result = value;
442
return result & 0x000000FFUL;
445
// convert a string representation to a character
447
t_quad Unicode::toquad (const String& value) {
448
long slen = value.length ();
449
// check for single character
451
t_quad result = value[0];
454
// check for ascii representation
455
if ((slen > 2) && (value[0] == '\'')) {
456
t_quad result = Unicode::toquad (Ascii::tochar (value));
459
// check for unicode representation
460
if ((slen > 2) && (value[0] == 'U') && (value[1] == '+')) {
462
String format = "0x";
463
format += value.rsubstr (2);
465
return (t_quad) Utility::tolong (format);
468
throw Exception ("format-error",
469
"illegal unicode string representation", value);
472
// convert a unicode character value to a string
474
String Unicode::tostring (const t_quad value) {
475
// check for an ascii character
476
if ((value & 0xFFFFFF00UL) == nilq) {
477
char cval = (char) (value & 0x000000FFUL);
478
String result = Ascii::tostring (cval);
481
// we are outside the ascii range, so use the unicode representation
482
String result = "U+";
483
result += Utility::tohexa (value);
487
// convert a native character value to a literal string
489
String Unicode::toliteral (const t_quad value) {
491
if (Unicode::isascii (value) == true) {
492
char cval = (char) (value & 0x000000FFUL);
498
result += Unicode::tostring (value);
504
// get the size of unicode array
506
long Unicode::strlen (const t_quad* s) {
507
// check for nil string
508
if (s == nilp) return 0;
511
while (*s++ != nilq) result++;
515
// compare two strings and return true if they are equal
517
bool Unicode::strcmp (const t_quad* s1, const bool n1, const char* s2) {
518
// normalize the string first
519
const t_quad* ns1 = n1 ? s1 : c_ucdnrm (s1, Unicode::strlen (s1));
520
const t_quad* ns2 = c_ucdnrm (s2, Ascii::strlen (s2));
521
// compute string length
522
long len1 = Unicode::strlen (ns1);
523
long len2 = Unicode::strlen (ns2);
524
// check length first
526
if (n1 == false) delete [] ns1;
532
for (long i = 0; i < len1; i++) {
533
if (ns1[i] != ns2[i]) {
538
// clean temporaries and return
539
if (n1 == false) delete [] ns1;
544
// compare two strings and returns true if they are equals.
546
bool Unicode::strcmp (const t_quad* s1, const char* s2) {
547
return Unicode::strcmp (s1, false, s2);
550
// compare two strings and return true if they are equal
552
bool Unicode::strcmp (const t_quad* s1, const bool n1,
553
const t_quad* s2, const bool n2) {
554
// normalize the string first
555
const t_quad* ns1 = n1 ? s1 : c_ucdnrm (s1, Unicode::strlen (s1));
556
const t_quad* ns2 = n2 ? s2 : c_ucdnrm (s2, Unicode::strlen (s2));
557
// compute string length
558
long len1 = Unicode::strlen (ns1);
559
long len2 = Unicode::strlen (ns2);
560
// check length first
562
if (n1 == false) delete [] ns1;
563
if (n2 == false) delete [] ns2;
568
for (long i = 0; i < len1; i++) {
569
if (ns1[i] != ns2[i]) {
574
// clean temporaries and return
575
if (n1 == false) delete [] ns1;
576
if (n2 == false) delete [] ns2;
580
// compare two strings and returns true if they are equals.
582
bool Unicode::strcmp (const t_quad* s1, const t_quad* s2) {
583
return Unicode::strcmp (s1, false, s2, false);
586
// compare two strings up to n characters
588
bool Unicode::strncmp (const t_quad* s1, const char* s2, const long size) {
590
if (size == 0) return true;
591
// normalize the string first
592
t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
593
t_quad* ns2 = c_ucdnrm (s2, Ascii::strlen (s2));
594
// compute string length
595
long len1 = Unicode::strlen (ns1);
596
long len2 = Unicode::strlen (ns2);
597
// check length first
598
if ((len1 < size) || (len2 < size)) {
605
for (long i = 0; i < size; i++) {
606
if (ns1[i] != ns2[i]) {
611
// clean temporaries and return
617
// compare two strings up to n characters
619
bool Unicode::strncmp (const t_quad* s1, const t_quad* s2, const long size) {
621
if (size == 0) return true;
622
// normalize the string first
623
t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
624
t_quad* ns2 = c_ucdnrm (s2, Unicode::strlen (s2));
625
// compute string length
626
long len1 = Unicode::strlen (ns1);
627
long len2 = Unicode::strlen (ns2);
628
// check length first
629
if ((len1 < size) || (len2 < size)) {
636
for (long i = 0; i < size; i++) {
637
if (ns1[i] != ns2[i]) {
642
// clean temporaries and return
648
// compare two strings - less than operator
650
bool Unicode::strlth (const t_quad* s1, const char* s2) {
651
// normalize the string first
652
t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
653
t_quad* ns2 = c_ucdnrm (s2, Ascii::strlen (s2));
654
// save pointers for delete
657
// compare without equal
659
while (*ns1 != nilq) {
664
if (*ns1++ > *ns2++) break;
666
// clean temporaries and return
672
// compare two strings - less than operator
674
bool Unicode::strlth (const t_quad* s1, const t_quad* s2) {
675
// normalize the string first
676
t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
677
t_quad* ns2 = c_ucdnrm (s2, Unicode::strlen (s2));
678
// save pointers for delete
681
// compare without equal
683
while (*ns1 != nilq) {
688
if (*ns1++ > *ns2++) break;
690
// clean temporaries and return
696
// compare two strings - less equal operator
698
bool Unicode::strleq (const t_quad* s1, const char* s2) {
699
// normalize the string first
700
t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
701
t_quad* ns2 = c_ucdnrm (s2, Ascii::strlen (s2));
702
// save pointers for delete
705
// compare with equal
707
while (*ns1 != nilq) {
708
if (*ns1 < *ns2) break;
709
if (*ns1++ > *ns2++) {
714
// clean temporaries and return
720
// compare two strings - less equal operator
722
bool Unicode::strleq (const t_quad* s1, const t_quad* s2) {
723
// normalize the string first
724
t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
725
t_quad* ns2 = c_ucdnrm (s2, Unicode::strlen (s2));
726
// save pointers for delete
729
// compare with equal
731
while (*ns1 != nilq) {
732
if (*ns1 < *ns2) break;
733
if (*ns1++ > *ns2++) {
738
// clean temporaries and return
744
// convert an ascii character to an unicode array
746
t_quad* Unicode::strmak (const char value) {
748
buf[0] = Unicode::toquad (value);
750
return strdup (buf, 1L);
753
// convert a unicode character to an unicode array
755
t_quad* Unicode::strmak (const t_quad value) {
759
return strdup (buf, 1L);
762
// create a unicode string from a string and a character
764
t_quad* Unicode::strmak (const t_quad* s, const char c) {
765
t_quad value = Unicode::toquad (c);
766
return Unicode::strmak (s, value);
769
// create a unicode string from a string one and a unicode character
771
t_quad* Unicode::strmak (const t_quad* s, const t_quad c) {
773
long len = Unicode::strlen (s);
774
t_quad* buf = new t_quad[len+2];
775
// copy string directly
776
for (long i = 0; i < len; i++) buf[i] = s[i];
779
// normalize and clean
781
t_quad* result = strdup (buf, len+1);
790
// create a unicode string from a character and a string
792
t_quad* Unicode::strmak (const char c, const t_quad* s) {
793
t_quad value = Unicode::toquad (c);
794
return Unicode::strmak (value, s);
797
// create a unicode string from a unicode character and a string
799
t_quad* Unicode::strmak (const t_quad c, const t_quad* s) {
801
long len = Unicode::strlen (s);
802
t_quad* buf = new t_quad[len+2];
803
// add character and copy string
805
for (long i = 0; i < len; i++) buf[i+1] = s[i];
807
// normalize and clean
809
t_quad* result = strdup (buf, len+1);
818
// concatenate two strings and normalize the result
820
t_quad* Unicode::strmak (const t_quad* s1, const char* s2) {
821
// compute arguments length
822
long len1 = Unicode::strlen (s1);
823
long len2 = Ascii::strlen (s2);
824
// allocate a temporary buffer and copy
825
t_quad* buf = new t_quad[len1+len2+1];
826
for (long i = 0; i < len1; i++) buf[i] = s1[i];
827
for (long i = 0; i < len2; i++) buf[len1+i] = Unicode::toquad (s2[i]);
828
buf[len1+len2] = nilq;
829
// normalize and clean
831
t_quad* result = strdup (buf, len1+len2);
840
// concatenate two strings and normalize the result
842
t_quad* Unicode::strmak (const t_quad* s1, const t_quad* s2) {
843
// compute arguments length
844
long len1 = Unicode::strlen (s1);
845
long len2 = Unicode::strlen (s2);
846
// allocate a temporary buffer and copy
847
t_quad* buf = new t_quad[len1+len2+1];
848
for (long i = 0; i < len1; i++) buf[i] = s1[i];
849
for (long i = 0; i < len2; i++) buf[len1+i] = s2[i];
850
buf[len1+len2] = nilq;
851
// normalize and clean
853
t_quad* result = strdup (buf, len1+len2);
862
// convert an ascii string to an unicode array
864
t_quad* Unicode::strdup (const char* s) {
865
return Unicode::strdup (s, false);
868
// convert a unicode string to an unicode array
870
t_quad* Unicode::strdup (const t_quad* s) {
871
return Unicode::strdup (s, false);
874
// convert an ascii string to an unicode array
876
t_quad* Unicode::strdup (const char* s, const bool nrmf) {
877
// get the buffer length and check for nil
878
long len = Ascii::strlen (s);
879
// convert the source buffer
880
t_quad* buf = new t_quad[len+1];
882
for (long i = 0; i < len; i++) buf[i] = Unicode::toquad (s[i]);
884
t_quad* result = buf;
886
result = c_ucdnrm (buf, len);
896
// convert a unicode string to an unicode array
898
t_quad* Unicode::strdup (const t_quad* s, const bool nrmf) {
899
// get the string length and check for nil
900
long len = Unicode::strlen (s);
901
// create a new quad array
902
t_quad* buf = new t_quad[len+1];
903
// copy the source buffer
905
for (long i = 0; i < len; i++) buf[i] = s[i];
907
t_quad* result = buf;
909
result = c_ucdnrm (buf, len);
919
// convert a character buffer to an unicode array by size
921
t_quad* Unicode::strdup (const char* s, const long size) {
922
// create a new quad array
923
t_quad* result = new t_quad[size+1];
924
// convert the source buffer
926
for (long i = 0; i < size; i++) result[i] = Unicode::toquad (s[i]);
935
// convert a unicode string to an unicode array by size
937
t_quad* Unicode::strdup (const t_quad* s, const long size) {
938
// create a new quad array
939
t_quad* result = new t_quad[size+1];
940
// copy the source buffer
942
for (long i = 0; i < size; i++) result[i] = s[i];
951
// normalize a string by performing a normal form decomposition
953
t_quad* Unicode::strnrm (const t_quad* s) {
954
// get the string length and check for nil
955
long len = Unicode::strlen (s);
956
// normalize the string
957
return c_ucdnrm (s, len);
960
// remove the leading blank and tab and return a new string
962
t_quad* Unicode::stripl (const char* s) {
964
while ((*s != nilc) && ((*s == blkc) || (*s == tabc))) s++;
966
return Unicode::strdup (s);
969
// remove the leading separators and return a new string
971
t_quad* Unicode::stripl (const char* s, const char* sep) {
973
while ((*s != nilc) && (is_csep (*s, sep) == true)) s++;
975
return Unicode::strdup (s);
978
// remove the leading blank and tab and return a new string
980
t_quad* Unicode::stripl (const t_quad* s) {
982
while ((*s != nilq) && ((*s == blkq) || (*s == tabq))) s++;
984
return Unicode::strdup (s);
987
// remove the leading separators and return a new string
989
t_quad* Unicode::stripl (const t_quad* s, const t_quad* sep) {
991
while ((*s != nilq) && (is_qsep (*s, sep) == true)) s++;
993
return Unicode::strdup (s);
996
// remove the trailing blank and return a new string
998
t_quad* Unicode::stripr (const char* s) {
999
// get the length and check
1000
long len = Ascii::strlen (s);
1001
if (len == 0) return c_ucdnil ();
1002
char* buf = Ascii::strdup (s);
1003
char* end = buf + len - 1;
1004
// remove trailing blank
1005
while ((end != s) && ((*end == blkc) || (*end == tabc))) *end-- = nilc;
1006
// now copy and return
1007
t_quad* result = Unicode::strdup (buf);
1012
// remove the trailing separators and return a new string
1014
t_quad* Unicode::stripr (const char* s, const char* sep) {
1015
// get the length and check
1016
long len = Ascii::strlen (s);
1017
if (len == 0) return c_ucdnil ();
1018
char* buf = Ascii::strdup (s);
1019
char* end = buf + len - 1;
1020
// remove trailing blank
1021
while ((end != s) && (is_csep (*end, sep) == true)) *end-- = nilc;
1022
// now copy and return
1023
t_quad* result = Unicode::strdup (buf);
1028
// remove the trailing blank and return a new string
1030
t_quad* Unicode::stripr (const t_quad* s) {
1031
// get the length and check
1032
long len = Unicode::strlen (s);
1033
if (len == 0) return c_ucdnil ();
1034
t_quad* buf = Unicode::strdup (s);
1035
t_quad* end = buf + len - 1;
1036
// remove trailing blank
1037
while ((end != s) && ((*end == blkq) || (*end == tabq))) *end-- = nilq;
1038
// now copy and return
1039
t_quad* result = Unicode::strdup (buf);
1044
// remove the trailing separators and return a new string
1046
t_quad* Unicode::stripr (const t_quad* s, const t_quad* sep) {
1047
// get the length and check
1048
long len = Unicode::strlen (s);
1049
if (len == 0) return c_ucdnil ();
1050
t_quad* buf = Unicode::strdup (s);
1051
t_quad* end = buf + len - 1;
1052
// remove trailing blank
1053
while ((end != s) && (is_qsep (*end, sep) == true)) *end-- = nilq;
1054
// now copy and return
1055
t_quad* result = Unicode::strdup (buf);
1060
// convert an ascii string to lower case
1062
t_quad* Unicode::tolower (const char* s) {
1064
if (s == nilp) return c_ucdnil ();
1065
long len = Ascii::strlen (s);
1066
// allocate and convert
1067
long size = len * UCD_LCM_MAX + 1;
1068
t_quad* sbuf = new t_quad[size];
1070
t_quad sdst[UCD_LCM_MAX];
1071
for (long i = 0; i < len; i++) {
1072
long cnvs = c_ucdtol (sdst, Unicode::toquad (s[i]));
1073
for (long j = 0; j < cnvs; j++) sbuf[sidx++] = sdst[j];
1075
// mark end of string
1078
t_quad* result = Unicode::strdup (sbuf);
1083
// convert an ascii string to lower case
1085
t_quad* Unicode::tolower (const t_quad* s) {
1087
if (s == nilp) return c_ucdnil ();
1088
long len = Unicode::strlen (s);
1089
// allocate and convert
1090
long size = len * UCD_LCM_MAX + 1;
1091
t_quad* sbuf = new t_quad[size];
1093
t_quad sdst[UCD_LCM_MAX];
1094
for (long i = 0; i < len; i++) {
1095
long cnvs = c_ucdtol (sdst, s[i]);
1096
for (long j = 0; j < cnvs; j++) sbuf[sidx++] = sdst[j];
1098
// mark end of string
1101
t_quad* result = Unicode::strdup (sbuf);
1106
// convert an ascii string to upper case
1108
t_quad* Unicode::toupper (const char* s) {
1110
if (s == nilp) return c_ucdnil ();
1111
long len = Ascii::strlen (s);
1112
// allocate and convert
1113
long size = len * UCD_UCM_MAX + 1;
1114
t_quad* sbuf = new t_quad[size];
1116
t_quad sdst[UCD_UCM_MAX];
1117
for (long i = 0; i < len; i++) {
1118
long cnvs = c_ucdtou (sdst, Unicode::toquad (s[i]));
1119
for (long j = 0; j < cnvs; j++) sbuf[sidx++] = sdst[j];
1121
// mark end of string
1124
t_quad* result = Unicode::strdup (sbuf);
1129
// convert an unicode string to upper case
1131
t_quad* Unicode::toupper (const t_quad* s) {
1133
if (s == nilp) return c_ucdnil ();
1134
long len = Unicode::strlen (s);
1135
// allocate and convert
1136
long size = len * UCD_UCM_MAX + 1;
1137
t_quad* sbuf = new t_quad[size];
1139
t_quad sdst[UCD_UCM_MAX];
1140
for (long i = 0; i < len; i++) {
1141
long cnvs = c_ucdtou (sdst, s[i]);
1142
for (long j = 0; j < cnvs; j++) sbuf[sidx++] = sdst[j];
1144
// mark end of string
1147
t_quad* result = Unicode::strdup (sbuf);
1152
// return true if the character is a lower character
1154
bool Unicode::islower (const t_quad code) {
1155
// get the ucd record and do nothing if it does not exist
1156
const ucd_s* ucd = c_getucd (code);
1157
if (ucd == nilp) return false;
1158
// check for lower case code
1159
return (ucd->d_pgcv == UCD_GCV_LL);
1162
// return true if the character is an upper character
1164
bool Unicode::isupper (const t_quad code) {
1165
// get the ucd record and do nothing if it does not exist
1166
const ucd_s* ucd = c_getucd (code);
1167
if (ucd == nilp) return false;
1168
// check for lower case code
1169
return (ucd->d_pgcv == UCD_GCV_LU);
1172
// return true if the unicode character is a letter
1174
bool Unicode::isletter (const t_quad code) {
1175
// get the ucd record and do nothing if it does not exist
1176
const ucd_s* ucd = c_getucd (code);
1177
if (ucd == nilp) return false;
1178
// get the gcv byte and check
1179
t_byte gcv = ucd->d_pgcv;
1180
if (gcv == UCD_GCV_LU) return true;
1181
if (gcv == UCD_GCV_LL) return true;
1182
if (gcv == UCD_GCV_LT) return true;
1183
if (gcv == UCD_GCV_LM) return true;
1184
if (gcv == UCD_GCV_LO) return true;
1188
// return true if the unicode character is a digit
1190
bool Unicode::isdigit (const t_quad code) {
1191
// get the ucd record and do nothing if it does not exist
1192
const ucd_s* ucd = c_getucd (code);
1193
if (ucd == nilp) return false;
1194
// get the gcv byte and check
1195
t_byte gcv = ucd->d_pgcv;
1196
if (gcv == UCD_GCV_ND) return true;
1200
// return true if the unicode character is a combining alphanumeric
1202
bool Unicode::iscan (const t_quad code) {
1203
// get the ucd record and do nothing if it does not exist
1204
const ucd_s* ucd = c_getucd (code);
1205
if (ucd == nilp) return false;
1207
t_byte gcv = ucd->d_pgcv;
1209
if (gcv == UCD_GCV_LU) return true;
1210
if (gcv == UCD_GCV_LL) return true;
1211
if (gcv == UCD_GCV_LT) return true;
1212
if (gcv == UCD_GCV_LM) return true;
1213
if (gcv == UCD_GCV_LO) return true;
1214
// check for marking
1215
if (gcv == UCD_GCV_MN) return true;
1217
if (gcv == UCD_GCV_ND) return true;
1222
// return true if the unicode character is an alpha-numeric character
1224
bool Unicode::isalpha (const t_quad code) {
1225
// check for a digit
1226
if (Unicode::isdigit (code) == true) return true;
1228
if (Unicode::isletter (code) == true) return true;
1234
// return true if the unicode character is a blank or tab
1236
bool Unicode::isblank (const t_quad code) {
1237
if ((code == blkq) || (code == tabq)) return true;
1241
// return true if the unicode character is an ascii character
1243
bool Unicode::isascii (const t_quad code) {
1244
if ((code & 0xFFFFFF80UL) == nilq) return true;
1248
// return true if the unicode character is a bmp character
1250
bool Unicode::isbmp (const t_quad code) {
1251
if ((code & 0xFFFF0000UL) == nilq) return true;
1255
// return true if the unicode character is a latin character
1257
bool Unicode::islatin (const t_quad code) {
1258
if ((code & 0xFFFFFF00UL) == nilq) return true;
1262
// return true if the unicode character is a bit character
1264
bool Unicode::isbit (const t_quad code) {
1265
if ((code == (t_quad) '0') || (code == (t_quad) '1')) return true;
1269
// return true if the unicode character is an hexadecimal character
1271
bool Unicode::ishexa (const t_quad code) {
1272
if ((code >= (t_quad) '0') && (code <= (t_quad) '9')) return true;
1273
if ((code >= (t_quad) 'a') && (code <= (t_quad) 'f')) return true;
1274
if ((code >= (t_quad) 'A') && (code <= (t_quad) 'F')) return true;
1278
// return true if the character is an afnix constituent
1280
bool Unicode::isafnix (const t_quad code) {
1281
// check for an alhpa character
1282
if (isalpha (code) == true) return true;
1283
// check for other constituents
1284
if (code == (t_quad) '.') return true;
1285
if (code == (t_quad) '+') return true;
1286
if (code == (t_quad) '-') return true;
1287
if (code == (t_quad) '*') return true;
1288
if (code == (t_quad) '/') return true;
1289
if (code == (t_quad) '!') return true;
1290
if (code == (t_quad) '=') return true;
1291
if (code == (t_quad) '.') return true;
1292
if (code == (t_quad) '>') return true;
1293
if (code == (t_quad) '<') return true;
1294
if (code == (t_quad) '?') return true;
1298
// return true if the unicode character is a valid terminal character
1300
bool Unicode::isterm (const t_quad code) {
1301
// get the ucd record and do nothing if it does not exist
1302
const ucd_s* ucd = c_getucd (code);
1303
if (ucd == nilp) return false;
1305
t_byte gcv = ucd->d_pgcv;
1307
if (gcv == UCD_GCV_LU) return true;
1308
if (gcv == UCD_GCV_LL) return true;
1309
if (gcv == UCD_GCV_LT) return true;
1310
if (gcv == UCD_GCV_LM) return true;
1311
if (gcv == UCD_GCV_LO) return true;
1312
// check for marking
1313
if (gcv == UCD_GCV_MN) return true;
1314
if (gcv == UCD_GCV_MC) return true;
1315
if (gcv == UCD_GCV_ME) return true;
1317
if (gcv == UCD_GCV_ND) return true;
1318
if (gcv == UCD_GCV_NL) return true;
1319
if (gcv == UCD_GCV_NO) return true;
1320
// check for punctuation
1321
if (gcv == UCD_GCV_PC) return true;
1322
if (gcv == UCD_GCV_PD) return true;
1323
if (gcv == UCD_GCV_PS) return true;
1324
if (gcv == UCD_GCV_PE) return true;
1325
if (gcv == UCD_GCV_PI) return true;
1326
if (gcv == UCD_GCV_PF) return true;
1327
if (gcv == UCD_GCV_PO) return true;
1329
if (gcv == UCD_GCV_SM) return true;
1330
if (gcv == UCD_GCV_SC) return true;
1331
if (gcv == UCD_GCV_SK) return true;
1332
if (gcv == UCD_GCV_SO) return true;
1333
// check for spacing
1334
if (gcv == UCD_GCV_ZS) return true;
1335
// not for a terminal
1340
// return true if the character is a word constituent
1342
bool Unicode::iswcc (const t_quad code) {
1343
// get the ucd record and do nothing if it does not exist
1344
const ucd_s* ucd = c_getucd (code);
1345
if (ucd == nilp) return false;
1347
t_byte gcv = ucd->d_pgcv;
1349
if (gcv == UCD_GCV_LU) return true;
1350
if (gcv == UCD_GCV_LL) return true;
1351
if (gcv == UCD_GCV_LT) return true;
1352
if (gcv == UCD_GCV_LM) return true;
1353
if (gcv == UCD_GCV_LO) return true;
1354
// check for marking
1355
if (gcv == UCD_GCV_MN) return true;
1356
if (gcv == UCD_GCV_MC) return true;
1357
if (gcv == UCD_GCV_ME) return true;
1359
if (gcv == UCD_GCV_ND) return true;
1360
if (gcv == UCD_GCV_NL) return true;
1361
if (gcv == UCD_GCV_NO) return true;
1362
// not for a terminal
1366
// return true if the character is a non combining character
1368
bool Unicode::isncc (const t_quad code) {
1369
return c_ucdncc (code);
1372
// get the non-combining length of a unicode string
1374
long Unicode::ncclen (const t_quad* s) {
1375
// check for nil string
1376
if (s == nilp) return 0;
1377
// compute length by counting only the grapheme
1379
while (*s != nilq) {
1380
if (c_ucdncc (*s++) == true) result++;
1385
// encode a unicode character depending on the mode
1387
char* Unicode::encode (const Encoding::t_emod emod, const t_quad c) {
1388
return Unicode::encode (emod, &c, 1);
1391
// encode a string based on a mode
1393
char* Unicode::encode (const Encoding::t_emod emod, const String& s) {
1394
t_quad* sbuf = s.toquad ();
1396
char* cbuf = Unicode::encode (emod, sbuf);
1405
// encode a unicode string depending on the mode
1407
char* Unicode::encode (const Encoding::t_emod emod, const t_quad* s) {
1408
// get the buffer size
1409
long size = Unicode::strlen (s);
1410
// encode by mode and size
1411
return encode (emod, s, size);
1414
// encode a unicode string buffer depending on the mode
1416
char* Unicode::encode (const Encoding::t_emod emod, const t_quad* s,
1419
if (size <= 0) return nilp;
1420
// prepare the result
1421
char* result = nilp;
1424
case Encoding::BYTE:
1425
result = qtoc_byte (s, size);
1427
case Encoding::UTF8:
1428
result = qtoc_utf8 (s, size);
1435
// check if a buffer is a valid encoding
1437
bool Unicode::valid (const Encoding::t_emod emod, const char* s,
1439
// initialize result
1440
bool result = false;
1441
// check based on mode
1443
case Encoding::BYTE:
1446
case Encoding::UTF8:
1447
result = is_utf_08 ((const t_byte*) s, size);
1454
// decode a unicode buffer into a quad
1456
t_quad Unicode::decode (const char* buf) {
1457
// trivial check first
1459
throw Exception ("decode-error", "invalid buffer for unicode decoding");
1461
// convert the buffer
1462
t_quad result = nilq;
1463
if (utf_08_toq (result, (const t_byte*) buf) == false) {
1464
throw Exception ("decode-error", "cannot decode utf8 buffer");
1469
// decode a unicode buffer
1471
t_quad* Unicode::decode (const Encoding::t_emod emod, const char* s) {
1472
// get the buffer size
1473
long size = Ascii::strlen (s);
1474
// decode by mode and size
1475
return Unicode::decode (emod, s, size);
1478
// decode a unicode buffer by mode and size
1480
t_quad* Unicode::decode (const Encoding::t_emod emod, const char* s,
1483
if (size <= 0) return nilp;
1484
// prepare the result
1485
t_quad* result = nilp;
1488
case Encoding::BYTE:
1489
result = ctoq_byte (s, size);
1491
case Encoding::UTF8:
1492
result = ctoq_utf8 (s, size);