/*
 * libid3tag - ID3 tag manipulation library
 * Copyright (C) 2000-2004 Underbit Technologies, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * $Id: utf8.c,v 1.9 2004/01/23 09:41:32 rob Exp $
 */
35
* NAME: utf8->length()
36
* DESCRIPTION: return the number of ucs4 chars represented by a utf8 string
38
id3_length_t id3_utf8_length(id3_utf8_t const *utf8)
40
id3_length_t length = 0;
43
if ((utf8[0] & 0x80) == 0x00)
45
else if ((utf8[0] & 0xe0) == 0xc0 &&
46
(utf8[1] & 0xc0) == 0x80) {
47
if (((utf8[0] & 0x1fL) << 6) >= 0x00000080L) {
52
else if ((utf8[0] & 0xf0) == 0xe0 &&
53
(utf8[1] & 0xc0) == 0x80 &&
54
(utf8[2] & 0xc0) == 0x80) {
55
if ((((utf8[0] & 0x0fL) << 12) |
56
((utf8[1] & 0x3fL) << 6)) >= 0x00000800L) {
61
else if ((utf8[0] & 0xf8) == 0xf0 &&
62
(utf8[1] & 0xc0) == 0x80 &&
63
(utf8[2] & 0xc0) == 0x80 &&
64
(utf8[3] & 0xc0) == 0x80) {
65
if ((((utf8[0] & 0x07L) << 18) |
66
((utf8[1] & 0x3fL) << 12)) >= 0x00010000L) {
71
else if ((utf8[0] & 0xfc) == 0xf8 &&
72
(utf8[1] & 0xc0) == 0x80 &&
73
(utf8[2] & 0xc0) == 0x80 &&
74
(utf8[3] & 0xc0) == 0x80 &&
75
(utf8[4] & 0xc0) == 0x80) {
76
if ((((utf8[0] & 0x03L) << 24) |
77
((utf8[0] & 0x3fL) << 18)) >= 0x00200000L) {
82
else if ((utf8[0] & 0xfe) == 0xfc &&
83
(utf8[1] & 0xc0) == 0x80 &&
84
(utf8[2] & 0xc0) == 0x80 &&
85
(utf8[3] & 0xc0) == 0x80 &&
86
(utf8[4] & 0xc0) == 0x80 &&
87
(utf8[5] & 0xc0) == 0x80) {
88
if ((((utf8[0] & 0x01L) << 30) |
89
((utf8[0] & 0x3fL) << 24)) >= 0x04000000L) {
103
* DESCRIPTION: return the encoding size of a utf8 string
105
id3_length_t id3_utf8_size(id3_utf8_t const *utf8)
107
id3_utf8_t const *ptr = utf8;
112
return ptr - utf8 + 1;
116
* NAME: utf8->ucs4duplicate()
117
* DESCRIPTION: duplicate and decode a utf8 string into ucs4
119
id3_ucs4_t *id3_utf8_ucs4duplicate(id3_utf8_t const *utf8)
123
ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
125
id3_utf8_decode(utf8, ucs4);
127
return release(ucs4);
131
* NAME: utf8->decodechar()
132
* DESCRIPTION: decode a series of utf8 chars into a single ucs4 char
134
id3_length_t id3_utf8_decodechar(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
136
id3_utf8_t const *start = utf8;
139
if ((utf8[0] & 0x80) == 0x00) {
141
return utf8 - start + 1;
143
else if ((utf8[0] & 0xe0) == 0xc0 &&
144
(utf8[1] & 0xc0) == 0x80) {
146
((utf8[0] & 0x1fL) << 6) |
147
((utf8[1] & 0x3fL) << 0);
148
if (*ucs4 >= 0x00000080L)
149
return utf8 - start + 2;
151
else if ((utf8[0] & 0xf0) == 0xe0 &&
152
(utf8[1] & 0xc0) == 0x80 &&
153
(utf8[2] & 0xc0) == 0x80) {
155
((utf8[0] & 0x0fL) << 12) |
156
((utf8[1] & 0x3fL) << 6) |
157
((utf8[2] & 0x3fL) << 0);
158
if (*ucs4 >= 0x00000800L)
159
return utf8 - start + 3;
161
else if ((utf8[0] & 0xf8) == 0xf0 &&
162
(utf8[1] & 0xc0) == 0x80 &&
163
(utf8[2] & 0xc0) == 0x80 &&
164
(utf8[3] & 0xc0) == 0x80) {
166
((utf8[0] & 0x07L) << 18) |
167
((utf8[1] & 0x3fL) << 12) |
168
((utf8[2] & 0x3fL) << 6) |
169
((utf8[3] & 0x3fL) << 0);
170
if (*ucs4 >= 0x00010000L)
171
return utf8 - start + 4;
173
else if ((utf8[0] & 0xfc) == 0xf8 &&
174
(utf8[1] & 0xc0) == 0x80 &&
175
(utf8[2] & 0xc0) == 0x80 &&
176
(utf8[3] & 0xc0) == 0x80 &&
177
(utf8[4] & 0xc0) == 0x80) {
179
((utf8[0] & 0x03L) << 24) |
180
((utf8[1] & 0x3fL) << 18) |
181
((utf8[2] & 0x3fL) << 12) |
182
((utf8[3] & 0x3fL) << 6) |
183
((utf8[4] & 0x3fL) << 0);
184
if (*ucs4 >= 0x00200000L)
185
return utf8 - start + 5;
187
else if ((utf8[0] & 0xfe) == 0xfc &&
188
(utf8[1] & 0xc0) == 0x80 &&
189
(utf8[2] & 0xc0) == 0x80 &&
190
(utf8[3] & 0xc0) == 0x80 &&
191
(utf8[4] & 0xc0) == 0x80 &&
192
(utf8[5] & 0xc0) == 0x80) {
194
((utf8[0] & 0x01L) << 30) |
195
((utf8[1] & 0x3fL) << 24) |
196
((utf8[2] & 0x3fL) << 18) |
197
((utf8[3] & 0x3fL) << 12) |
198
((utf8[4] & 0x3fL) << 6) |
199
((utf8[5] & 0x3fL) << 0);
200
if (*ucs4 >= 0x04000000L)
201
return utf8 - start + 6;
209
* NAME: utf8->encodechar()
210
* DESCRIPTION: encode a single ucs4 char into a series of up to 6 utf8 chars
212
id3_length_t id3_utf8_encodechar(id3_utf8_t *utf8, id3_ucs4_t ucs4)
214
if (ucs4 <= 0x0000007fL) {
219
else if (ucs4 <= 0x000007ffL) {
220
utf8[0] = 0xc0 | ((ucs4 >> 6) & 0x1f);
221
utf8[1] = 0x80 | ((ucs4 >> 0) & 0x3f);
225
else if (ucs4 <= 0x0000ffffL) {
226
utf8[0] = 0xe0 | ((ucs4 >> 12) & 0x0f);
227
utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
228
utf8[2] = 0x80 | ((ucs4 >> 0) & 0x3f);
232
else if (ucs4 <= 0x001fffffL) {
233
utf8[0] = 0xf0 | ((ucs4 >> 18) & 0x07);
234
utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
235
utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
236
utf8[3] = 0x80 | ((ucs4 >> 0) & 0x3f);
240
else if (ucs4 <= 0x03ffffffL) {
241
utf8[0] = 0xf8 | ((ucs4 >> 24) & 0x03);
242
utf8[1] = 0x80 | ((ucs4 >> 18) & 0x3f);
243
utf8[2] = 0x80 | ((ucs4 >> 12) & 0x3f);
244
utf8[3] = 0x80 | ((ucs4 >> 6) & 0x3f);
245
utf8[4] = 0x80 | ((ucs4 >> 0) & 0x3f);
249
else if (ucs4 <= 0x7fffffffL) {
250
utf8[0] = 0xfc | ((ucs4 >> 30) & 0x01);
251
utf8[1] = 0x80 | ((ucs4 >> 24) & 0x3f);
252
utf8[2] = 0x80 | ((ucs4 >> 18) & 0x3f);
253
utf8[3] = 0x80 | ((ucs4 >> 12) & 0x3f);
254
utf8[4] = 0x80 | ((ucs4 >> 6) & 0x3f);
255
utf8[5] = 0x80 | ((ucs4 >> 0) & 0x3f);
262
return id3_utf8_encodechar(utf8, ID3_UCS4_REPLACEMENTCHAR);
266
* NAME: utf8->decode()
267
* DESCRIPTION: decode a complete utf8 string into a ucs4 string
269
void id3_utf8_decode(id3_utf8_t const *utf8, id3_ucs4_t *ucs4)
272
utf8 += id3_utf8_decodechar(utf8, ucs4);
277
* NAME: utf8->encode()
278
* DESCRIPTION: encode a complete ucs4 string into a utf8 string
280
void id3_utf8_encode(id3_utf8_t *utf8, id3_ucs4_t const *ucs4)
283
utf8 += id3_utf8_encodechar(utf8, *ucs4);
289
* DESCRIPTION: serialize a single utf8 character
291
id3_length_t id3_utf8_put(id3_byte_t **ptr, id3_utf8_t utf8)
301
* DESCRIPTION: deserialize a single utf8 character
303
id3_utf8_t id3_utf8_get(id3_byte_t const **ptr)
309
* NAME: utf8->serialize()
310
* DESCRIPTION: serialize a ucs4 string using utf8 encoding
312
id3_length_t id3_utf8_serialize(id3_byte_t **ptr, id3_ucs4_t const *ucs4,
315
id3_length_t size = 0;
316
id3_utf8_t utf8[6], *out;
319
switch (id3_utf8_encodechar(out = utf8, *ucs4++)) {
320
case 6: size += id3_utf8_put(ptr, *out++);
321
case 5: size += id3_utf8_put(ptr, *out++);
322
case 4: size += id3_utf8_put(ptr, *out++);
323
case 3: size += id3_utf8_put(ptr, *out++);
324
case 2: size += id3_utf8_put(ptr, *out++);
325
case 1: size += id3_utf8_put(ptr, *out++);
331
size += id3_utf8_put(ptr, 0);
337
* NAME: utf8->deserialize()
338
* DESCRIPTION: deserialize a ucs4 string using utf8 encoding
340
id3_ucs4_t *id3_utf8_deserialize(id3_byte_t const **ptr, id3_length_t length)
342
id3_byte_t const *end;
343
id3_utf8_t *utf8ptr, *utf8;
348
utf8 = malloc((length + 1) * sizeof(*utf8));
353
while (end - *ptr > 0 && (*utf8ptr = id3_utf8_get(ptr)))
358
ucs4 = malloc((id3_utf8_length(utf8) + 1) * sizeof(*ucs4));
360
id3_utf8_decode(utf8, ucs4);