1
// ---------------------------------------------------------------------------
3
// - standard object library - unicode functions class definition -
4
// ---------------------------------------------------------------------------
5
// - This program is free software; you can redistribute it and/or modify -
6
// - it provided that this copyright notice is kept intact. -
8
// - This program is distributed in the hope that it will be useful, but -
9
// - without any warranty; without even the implied warranty of -
10
// - merchantability or fitness for a particular purpose. In no event shall -
11
// - the copyright holder be liable for any direct, indirect, incidental or -
12
// - special damages arising in any way out of the use of this software. -
13
// ---------------------------------------------------------------------------
14
// - copyright (c) 1999-2011 amaury darsch -
15
// ---------------------------------------------------------------------------
17
#ifndef AFNIX_UNICODE_HPP
18
#define AFNIX_UNICODE_HPP
20
#ifndef AFNIX_ENCODING_HPP
21
#include "Encoding.hpp"
26
/// The Unicode class is a collection of static convenience functions that
27
/// manipulate the unicode representation. The internal representation of
28
/// a unicode character is quad type in host representation.
29
/// @author amaury darsch
33
/// the maximum utf-8 buffer size, fixed at 6 (presumably the longest byte
/// sequence used to encode a single character -- to be confirmed)
34
static const long MAX_UTF8_SIZE = 6;
37
/// convert a unicode character to a native character if possible
38
/// @param value the unicode character to convert
/// @return the native character matching the unicode character
/// NOTE(review): the behavior when the conversion is not possible is not
/// visible from this declaration -- check the implementation
39
static char tochar (const t_quad value);
41
/// convert a unicode character to a bmp character if possible
42
/// @param value the unicode character to convert
/// @return the bmp character matching the unicode character
/// NOTE(review): the behavior when the conversion is not possible is not
/// visible from this declaration -- check the implementation
43
static t_word tobmp (const t_quad value);
45
/// convert a hexadecimal character to a byte
46
/// @param value the hexadecimal character to convert
/// @return the byte value of the hexadecimal character
47
static t_byte htob (const t_quad value);
49
/// convert a string into a byte array
50
/// @param size the size argument passed by reference (presumably set to
/// the length of the returned byte array -- to be confirmed)
/// @param s the string to convert
/// @return the byte array built from the string
51
static t_byte* stob (long& size, const String& s);
53
/// convert a native character to a unicode character
54
/// @param value the native character to convert
/// @return the unicode character matching the native character
55
static t_quad toquad (const char value);
57
/// convert a string to a unicode character
58
/// @param value the string value representation to convert
/// @return the unicode character represented by the string
59
static t_quad toquad (const String& value);
61
/// convert a unicode character to a string
62
/// @param value the unicode character to convert
/// @return the string representation of the unicode character
63
static String tostring (const t_quad value);
65
/// convert a unicode character to a literal string
66
/// @param value the unicode character to convert
/// @return the literal string representation of the unicode character
67
static String toliteral (const t_quad value);
69
/// compute the length of a unicode string
/// @param s the unicode string to measure
/// @return the length of a unicode string
70
static long strlen (const t_quad* s);
72
/// compare a unicode string with a character string and return true if
/// they are equal.
73
/// @param s1 the first string
74
/// @param n1 the first normal flag (presumably true when s1 is already in
/// normalized form -- to be confirmed)
75
/// @param s2 the second string
76
/// @return true if the strings are equal or both null
77
static bool strcmp (const t_quad* s1, const bool n1, const char* s2);
79
/// compare a unicode string with a character string and return true if
/// they are equal.
80
/// @param s1 the first string
81
/// @param s2 the second string
82
/// @return true if the strings are equal or both null
83
static bool strcmp (const t_quad* s1, const char* s2);
85
/// compare two unicode strings and return true if they are equal.
86
/// @param s1 the first string
87
/// @param n1 the first normal flag (presumably true when s1 is already in
/// normalized form -- to be confirmed)
88
/// @param s2 the second string
89
/// @param n2 the second normal flag (presumably true when s2 is already in
/// normalized form -- to be confirmed)
90
/// @return true if the strings are equal or both null
91
static bool strcmp (const t_quad* s1, const bool n1,
92
const t_quad* s2, const bool n2);
94
/// compare two unicode strings and return true if they are equal.
95
/// @param s1 the first string
96
/// @param s2 the second string
97
/// @return true if the strings are equal or both null
98
static bool strcmp (const t_quad* s1, const t_quad* s2);
100
/// compare a unicode string with a character string and return true if
101
/// they are equal for a number of characters. This function is safe with
/// null pointers.
102
/// @param s1 the first string
103
/// @param s2 the second string
104
/// @param size the number of characters to compare
105
/// @return true if the strings are equal or both null
106
static bool strncmp (const t_quad* s1, const char* s2, const long size);
108
/// compare two unicode strings and return true if they are equal for a
109
/// number of characters. This function is safe with null pointers.
110
/// @param s1 the first string
111
/// @param s2 the second string
112
/// @param size the number of characters to compare
113
/// @return true if the strings are equal or both null
114
static bool strncmp (const t_quad* s1, const t_quad* s2, const long size);
116
/// compare a unicode string with a character string and return true if
/// s1 is less than s2.
117
/// @param s1 the first string
118
/// @param s2 the second string
119
/// @return true if s1 is less than s2
120
static bool strlth (const t_quad* s1, const char* s2);
122
/// compare two unicode strings and return true if s1 is less than s2.
123
/// @param s1 the first string
124
/// @param s2 the second string
125
/// @return true if s1 is less than s2
126
static bool strlth (const t_quad* s1, const t_quad* s2);
128
/// compare a unicode string with a character string and return true if
/// s1 is less than or equal to s2.
129
/// @param s1 the first string
130
/// @param s2 the second string
131
/// @return true if s1 is less than or equal to s2
132
static bool strleq (const t_quad* s1, const char* s2);
134
/// compare two unicode strings and return true if s1 is less than or
/// equal to s2.
135
/// @param s1 the first string
136
/// @param s2 the second string
137
/// @return true if s1 is less than or equal to s2
138
static bool strleq (const t_quad* s1, const t_quad* s2);
140
/// create a unicode array from an ascii character
/// @param value the ascii character to use
/// @return a unicode array from an ascii character
141
static t_quad* strmak (const char value);
143
/// create a unicode array from a unicode character
/// @param value the unicode character to use
/// @return a unicode array from a unicode character
144
static t_quad* strmak (const t_quad value);
146
/// create a unicode string from a unicode string and a character
147
/// @param s the original string to use
148
/// @param c the character to add
/// @return the unicode string built from the string and the character
149
static t_quad* strmak (const t_quad* s, const char c);
151
/// create a unicode string from a unicode string and a unicode character
152
/// @param s the original string to use
153
/// @param c the unicode character to add
/// @return the unicode string built from the string and the character
154
static t_quad* strmak (const t_quad* s, const t_quad c);
156
/// create a unicode string from a character and a unicode string
157
/// @param c the character to add
158
/// @param s the original string to use
/// @return the unicode string built from the character and the string
159
static t_quad* strmak (const char c, const t_quad* s);
161
/// create a unicode string from a unicode character and a unicode string
162
/// @param c the unicode character to add
163
/// @param s the original string to use
/// @return the unicode string built from the character and the string
164
static t_quad* strmak (const t_quad c, const t_quad* s);
166
/// concatenate a unicode string with a character string and normalize
/// the result
167
/// @param s1 the first string
168
/// @param s2 the second string
/// @return the normalized concatenation of the two strings
169
static t_quad* strmak (const t_quad* s1, const char* s2);
171
/// concatenate two unicode strings and normalize the result
172
/// @param s1 the first string
173
/// @param s2 the second string
/// @return the normalized concatenation of the two strings
174
static t_quad* strmak (const t_quad* s1, const t_quad* s2);
176
/// create a unicode array from an ascii string
/// @param s the ascii string to use
/// @return a unicode array from an ascii string
177
static t_quad* strdup (const char* s);
179
/// create a unicode array from a unicode string
/// @param s the unicode string to use
/// @return a unicode array from a unicode string
180
static t_quad* strdup (const t_quad* s);
182
/// create a unicode array from an ascii string
/// @param s the ascii string to use
/// @param nrmf the normalization flag (presumably requests a normalized
/// result when true -- to be confirmed)
/// @return a unicode array from an ascii string
183
static t_quad* strdup (const char* s, const bool nrmf);
185
/// create a unicode array from a unicode string
/// @param s the unicode string to use
/// @param nrmf the normalization flag (presumably requests a normalized
/// result when true -- to be confirmed)
/// @return a unicode array from a unicode string
186
static t_quad* strdup (const t_quad* s, const bool nrmf);
188
/// create a unicode array from a character buffer and size
/// @param s the character buffer to use
/// @param size the buffer size
/// @return a unicode array from a character buffer and size
189
static t_quad* strdup (const char* s, const long size);
191
/// create a unicode array from a unicode string and size
/// @param s the unicode string to use
/// @param size the string size
/// @return a unicode array from a unicode string and size
192
static t_quad* strdup (const t_quad* s, const long size);
194
/// normalize a unicode string
/// @param s the unicode string to normalize
/// @return a string in a normal decomposition form
195
static t_quad* strnrm (const t_quad* s);
197
/// remove the leading blanks and tabs and return a new string
198
/// @param s the original character string
199
/// @return a new unicode string without the leading blanks and tabs
200
static t_quad* stripl (const char* s);
202
/// remove the leading separators and return a new string
203
/// @param s the original character string
204
/// @param sep the separator characters
205
/// @return a new unicode string without the leading separators
206
static t_quad* stripl (const char* s, const char* sep);
208
/// remove the leading blanks and tabs and return a new string
209
/// @param s the original unicode string
210
/// @return a new unicode string without the leading blanks and tabs
211
static t_quad* stripl (const t_quad* s);
213
/// remove the leading separators and return a new string
214
/// @param s the original unicode string
215
/// @param sep the separator characters
216
/// @return a new unicode string without the leading separators
217
static t_quad* stripl (const t_quad* s, const t_quad* sep);
219
/// remove the trailing blanks and tabs and return a new string
220
/// @param s the original character string
221
/// @return a new unicode string without the trailing blanks and tabs
222
static t_quad* stripr (const char* s);
224
/// remove the trailing separators and return a new string
225
/// @param s the original character string
226
/// @param sep the separator characters
227
/// @return a new unicode string without the trailing separators
228
static t_quad* stripr (const char* s, const char* sep);
230
/// remove the trailing blanks and tabs and return a new string
231
/// @param s the original unicode string
232
/// @return a new unicode string without the trailing blanks and tabs
233
static t_quad* stripr (const t_quad* s);
235
/// remove the trailing separators and return a new string
236
/// @param s the original unicode string
237
/// @param sep the separator characters
238
/// @return a new unicode string without the trailing separators
239
static t_quad* stripr (const t_quad* s, const t_quad* sep);
241
/// convert a character string to a lower case unicode string
242
/// @param s the original character string
243
/// @return a new lower case string
244
static t_quad* tolower (const char* s);
246
/// convert a unicode string to a lower case unicode string
247
/// @param s the original unicode string
248
/// @return a new lower case string
249
static t_quad* tolower (const t_quad* s);
251
/// convert a character string to an upper case unicode string
252
/// @param s the original character string
253
/// @return a new upper case string
254
static t_quad* toupper (const char* s);
256
/// convert a unicode string to an upper case unicode string
257
/// @param s the original unicode string
258
/// @return a new upper case string
259
static t_quad* toupper (const t_quad* s);
261
/// check if a unicode character is a lower case character
/// @param code the unicode character to check
/// @return true if the unicode character is a lower case character
262
static bool islower (const t_quad code);
264
/// check if a unicode character is an upper case character
/// @param code the unicode character to check
/// @return true if the unicode character is an upper case character
265
static bool isupper (const t_quad code);
267
/// check if a unicode character is a letter
/// @param code the unicode character to check
/// @return true if the unicode character is a letter
268
static bool isletter (const t_quad code);
270
/// check if a unicode character is a digit
/// @param code the unicode character to check
/// @return true if the unicode character is a digit
271
static bool isdigit (const t_quad code);
273
/// check if a unicode character is a combining alphanumeric
/// @param code the unicode character to check
/// @return true if the unicode character is a combining alphanumeric
274
static bool iscan (const t_quad code);
276
/// check if a unicode character is an alpha character
/// @param code the unicode character to check
/// @return true if the unicode character is alpha
277
static bool isalpha (const t_quad code);
279
/// check if a unicode character is a blank or a tab
/// @param code the unicode character to check
/// @return true if the unicode character is a blank or tab
280
static bool isblank (const t_quad code);
282
/// check if a unicode character is an ascii character
/// @param code the unicode character to check
/// @return true if the unicode character is an ascii character
283
static bool isascii (const t_quad code);
285
/// check if a unicode character is a bmp character
/// @param code the unicode character to check
/// @return true if the unicode character is a bmp character
286
static bool isbmp (const t_quad code);
288
/// check if a unicode character is a latin character
/// @param code the unicode character to check
/// @return true if the unicode character is a latin character
289
static bool islatin (const t_quad code);
291
/// check if a unicode character is a bit code
/// @param code the unicode character to check
/// @return true if the unicode character is a bit code
292
static bool isbit (const t_quad code);
294
/// check if a unicode character is a hexadecimal character
/// @param code the unicode character to check
/// @return true if the unicode character is a hexadecimal character
295
static bool ishexa (const t_quad code);
297
/// check if a unicode character is an afnix character
/// @param code the unicode character to check
/// @return true if the unicode character is an afnix character
298
static bool isafnix (const t_quad code);
300
/// check if a unicode character is a terminal character
/// @param code the unicode character to check
/// @return true if the unicode character is a terminal character
301
static bool isterm (const t_quad code);
303
/// check if a unicode character is a word constituent
/// @param code the unicode character to check
/// @return true if the unicode character is a word constituent
304
static bool iswcc (const t_quad code);
306
/// check if a unicode character is a non combining character
/// @param code the unicode character to check
/// @return true if the unicode character is a non combining character
307
static bool isncc (const t_quad code);
309
/// compute the non-combining length of a unicode string
/// @param s the unicode string to measure
/// @return the non-combining length of a unicode string
310
static long ncclen (const t_quad* s);
312
/// encode a unicode character into a character buffer
/// @param emod the encoding mode
/// @param c the unicode character to encode
/// @return a unicode character encoding
313
static char* encode (const Encoding::t_emod emod, const t_quad c);
315
/// encode a string into a character buffer
/// @param emod the encoding mode
/// @param s the string to encode
/// @return a string encoding of a string
316
static char* encode (const Encoding::t_emod emod, const String& s);
318
/// encode a unicode string into a character buffer
/// @param emod the encoding mode
/// @param s the unicode string to encode
/// @return a string encoding of a unicode string
319
static char* encode (const Encoding::t_emod emod, const t_quad* s);
321
/// @return a string encoding of a buffer by size
322
static char* encode (const Encoding::t_emod emod, const t_quad* s,
325
/// @return true if a unicode buffer is valid
326
static bool valid (const Encoding::t_emod emod, const char* s,
329
/// decode a unicode buffer into a quad
330
/// @param buf the utf8 buffer to decode
/// @return the unicode character decoded from the buffer
331
static t_quad decode (const char* buf);
333
/// decode a character buffer into a unicode string
/// @param emod the encoding mode
/// @param s the character buffer to decode
/// @return a unicode string by decoding a character buffer
334
static t_quad* decode (const Encoding::t_emod emod, const char* s);
336
/// @return a unicode string by decoding a character buffer by size
337
static t_quad* decode (const Encoding::t_emod emod, const char* s,