2
* Copyright (c) 2003-2007 Tim Kientzle
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
14
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17
* IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
#include "archive_platform.h"
27
__FBSDID("$FreeBSD: head/lib/libarchive/archive_string.c 201095 2009-12-28 02:33:22Z kientzle $");
30
* Basic resizable string support, to simplify manipulating arbitrary-sized
31
* strings while minimizing heap activity.
43
#if defined(_WIN32) && !defined(__CYGWIN__)
47
#include "archive_private.h"
48
#include "archive_string.h"
50
struct archive_string *
51
__archive_string_append(struct archive_string *as, const char *p, size_t s)
53
if (__archive_string_ensure(as, as->length + s + 1) == NULL)
54
__archive_errx(1, "Out of memory");
55
memcpy(as->s + as->length, p, s);
56
as->s[as->length + s] = 0;
62
__archive_string_copy(struct archive_string *dest, struct archive_string *src)
67
if (__archive_string_ensure(dest, src->length + 1) == NULL)
68
__archive_errx(1, "Out of memory");
69
memcpy(dest->s, src->s, src->length);
70
dest->length = src->length;
71
dest->s[dest->length] = 0;
76
__archive_string_concat(struct archive_string *dest, struct archive_string *src)
78
if (src->length > 0) {
79
if (__archive_string_ensure(dest, dest->length + src->length + 1) == NULL)
80
__archive_errx(1, "Out of memory");
81
memcpy(dest->s + dest->length, src->s, src->length);
82
dest->length += src->length;
83
dest->s[dest->length] = 0;
88
__archive_string_free(struct archive_string *as)
91
as->buffer_length = 0;
98
/* Returns NULL on any allocation failure. */
99
struct archive_string *
100
__archive_string_ensure(struct archive_string *as, size_t s)
102
/* If buffer is already big enough, don't reallocate. */
103
if (as->s && (s <= as->buffer_length))
107
* Growing the buffer at least exponentially ensures that
108
* append operations are always linear in the number of
109
* characters appended. Using a smaller growth rate for
110
* larger buffers reduces memory waste somewhat at the cost of
111
* a larger constant factor.
113
if (as->buffer_length < 32)
114
/* Start with a minimum 32-character buffer. */
115
as->buffer_length = 32;
116
else if (as->buffer_length < 8192)
117
/* Buffers under 8k are doubled for speed. */
118
as->buffer_length += as->buffer_length;
120
/* Buffers 8k and over grow by at least 25% each time. */
121
size_t old_length = as->buffer_length;
122
as->buffer_length += as->buffer_length / 4;
123
/* Be safe: If size wraps, release buffer and return NULL. */
124
if (as->buffer_length < old_length) {
131
* The computation above is a lower limit to how much we'll
132
* grow the buffer. In any case, we have to grow it enough to
135
if (as->buffer_length < s)
136
as->buffer_length = s;
137
/* Now we can reallocate the buffer. */
138
as->s = (char *)realloc(as->s, as->buffer_length);
144
/*
 * Append at most n bytes of the NUL-terminated data at _p to the string.
 * Copying stops at the first NUL byte or after n bytes, whichever comes
 * first.  Returns the result of __archive_string_append().
 */
struct archive_string *
__archive_strncat(struct archive_string *as, const void *_p, size_t n)
{
	const char *p = (const char *)_p;
	size_t s = 0;

	/*
	 * Like strlen(p), except won't examine positions at or beyond
	 * p[n]: the bound is tested before the byte is read, so a
	 * buffer of exactly n bytes without a terminator is safe.
	 */
	while (s < n && p[s] != '\0')
		s++;
	return (__archive_string_append(as, p, s));
}
162
/*
 * Append the single byte c to the string.  Returns the result of
 * __archive_string_append().
 */
struct archive_string *
__archive_strappend_char(struct archive_string *as, char c)
{
	return (__archive_string_append(as, &c, 1));
}
169
/*
 * Translates a wide character string into UTF-8 and appends
 * to the archive_string.  Note: returns NULL if conversion fails,
 * but still leaves a best-effort conversion in the argument as
 * (each unencodable value is written as '?').
 */
struct archive_string *
__archive_strappend_w_utf8(struct archive_string *as, const wchar_t *w)
{
	char *p;
	unsigned wc;
	char buff[256];
	struct archive_string *return_val = as;

	/*
	 * Convert one wide char at a time into 'buff', whenever that
	 * fills, append it to the string.
	 */
	p = buff;
	while (*w != L'\0') {
		/* Flush the buffer when we have <=16 bytes free. */
		/* (No encoding has a single character >16 bytes.) */
		if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - 16)) {
			*p = '\0';
			archive_strcat(as, buff);
			p = buff;
		}
		wc = *w++;
		/* If this is a surrogate pair, assemble the full code point.*/
		/* Note: wc must not be wchar_t here, because the full code
		 * point can be more than 16 bits! */
		if (wc >= 0xD800 && wc <= 0xDBff
		    && *w >= 0xDC00 && *w <= 0xDFFF) {
			wc -= 0xD800;
			wc *= 0x400;
			wc += (*w - 0xDC00);
			wc += 0x10000;
			++w;
		}
		/* Translate code point to UTF8 */
		if (wc <= 0x7f) {
			*p++ = (char)wc;
		} else if (wc <= 0x7ff) {
			*p++ = (char)(0xc0 | ((wc >> 6) & 0x1f));
			*p++ = (char)(0x80 | (wc & 0x3f));
		} else if (wc <= 0xffff) {
			*p++ = (char)(0xe0 | ((wc >> 12) & 0x0f));
			*p++ = (char)(0x80 | ((wc >> 6) & 0x3f));
			*p++ = (char)(0x80 | (wc & 0x3f));
		} else if (wc <= 0x1fffff) {
			*p++ = (char)(0xf0 | ((wc >> 18) & 0x07));
			*p++ = (char)(0x80 | ((wc >> 12) & 0x3f));
			*p++ = (char)(0x80 | ((wc >> 6) & 0x3f));
			*p++ = (char)(0x80 | (wc & 0x3f));
		} else {
			/* The 4-byte form cannot hold more than 0x1fffff
			 * (Unicode itself ends at 0x10ffff). */
			/* TODO: use \uXXXX escape here instead of ? */
			*p++ = '?';
			return_val = NULL;
		}
	}
	/* Flush whatever is still buffered. */
	*p = '\0';
	archive_strcat(as, buff);
	return (return_val);
}
234
/*
 * Decode one UTF-8 sequence of 1-4 bytes from s, examining at most n
 * bytes.
 *
 * Returns the number of bytes consumed and stores the code point in
 * *pwc; returns 0 (without touching *pwc) if the first byte is NUL;
 * returns -1 (without touching *pwc) if n is 0, the first byte is not
 * a valid lead byte, the sequence needs more than n bytes, or a
 * continuation byte is malformed.  Overlong forms and values above
 * 0x10ffff are not rejected here.
 */
static int
utf8_to_unicode(int *pwc, const char *s, size_t n)
{
	/* Work on unsigned bytes so masking never sees a negative char. */
	const unsigned char *u = (const unsigned char *)s;
	int lead, need, cp, i;

	if (n == 0)
		return (-1);
	lead = u[0];
	if (lead == 0)
		return (0); /* Standard:  return 0 for end-of-string. */
	if ((lead & 0x80) == 0) {
		*pwc = lead;
		return (1);
	}

	/* The lead byte fixes the sequence length and its payload bits. */
	if ((lead & 0xe0) == 0xc0) {
		need = 2;
		cp = lead & 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		need = 3;
		cp = lead & 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		need = 4;
		cp = lead & 0x07;
	} else {
		/* Invalid first byte. */
		return (-1);
	}

	/* Never look past the caller's limit. */
	if (n < (size_t)need)
		return (-1);
	/*
	 * Bytes are checked in order and the loop stops at the first
	 * bad one, so a NUL terminator inside a truncated sequence
	 * ends the scan before anything beyond it is read.
	 */
	for (i = 1; i < need; i++) {
		if ((u[i] & 0xc0) != 0x80)
			return (-1);
		cp = (cp << 6) | (u[i] & 0x3f);
	}
	*pwc = cp;
	return (need);
}
283
* Return a wide-character Unicode string by converting this archive_string
284
* from UTF-8. We assume that systems with 16-bit wchar_t always use
285
* UTF16 and systems with 32-bit wchar_t can accept UCS4.
288
__archive_string_utf8_w(struct archive_string *as)
291
int wc, wc2;/* Must be large enough for a 21-bit Unicode code point. */
295
ws = (wchar_t *)malloc((as->length + 1) * sizeof(wchar_t));
297
__archive_errx(1, "Out of memory");
300
while (*src != '\0') {
301
n = utf8_to_unicode(&wc, src, 8);
309
if (wc >= 0xDC00 && wc <= 0xDBFF) {
310
/* This is a leading surrogate; some idiot
311
* has translated UTF16 to UTF8 without combining
312
* surrogates; rebuild the full code point before
314
n = utf8_to_unicode(&wc2, src, 8);
319
if (n == 0) /* Ignore the leading surrogate */
321
if (wc2 < 0xDC00 || wc2 > 0xDFFF) {
322
/* If the second character isn't a
323
* trailing surrogate, then someone
324
* has really screwed up and this is
336
if ((sizeof(wchar_t) < 4) && (wc > 0xffff)) {
337
/* We have a code point that won't fit into a
338
* wchar_t; convert it to a surrogate pair. */
340
*dest++ = ((wc >> 10) & 0x3ff) + 0xD800;
341
*dest++ = (wc & 0x3ff) + 0xDC00;
349
#if defined(_WIN32) && !defined(__CYGWIN__)
352
* Translates a wide character string into current locale character set
353
* and appends to the archive_string. Note: returns NULL if conversion
356
* Win32 builds use WideCharToMultiByte from the Windows API.
357
* (Maybe Cygwin should too? WideCharToMultiByte will know a
358
* lot more about local character encodings than the wcrtomb()
359
* wrapper is going to know.)
361
struct archive_string *
362
__archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)
366
BOOL useDefaultChar = FALSE;
372
__archive_errx(1, "Out of memory");
373
/* To check a useDefaultChar is to simulate error handling of
374
* the my_wcstombs() which is running on non Windows system with
376
* And to set NULL for last argument is necessary when a codepage
377
* is not CP_ACP(current locale).
379
l = WideCharToMultiByte(CP_ACP, 0, w, wl, p, l, NULL, &useDefaultChar);
384
__archive_string_append(as, p, l);
392
/*
 * Translates a wide character string into current locale character set
 * and appends to the archive_string.  Note: returns NULL if conversion
 * fails; characters converted before the last buffer flush have
 * already been appended at that point.
 *
 * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion
 * one character at a time.  If a non-Windows platform doesn't have
 * either of these, fall back to the built-in UTF8 conversion.
 */
struct archive_string *
__archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)
{
#if !defined(HAVE_WCTOMB) && !defined(HAVE_WCRTOMB)
	/* If there's no built-in locale support, fall back to UTF8 always. */
	return __archive_strappend_w_utf8(as, w);
#else
	/* We cannot use the standard wcstombs() here because it
	 * cannot tell us how big the output buffer should be.  So
	 * I've built a loop around wcrtomb() or wctomb() that
	 * converts a character at a time and resizes the string as
	 * needed.  We prefer wcrtomb() when it's available because
	 * it's thread-safe. */
	int n;
	char *p;
	char buff[256];
#if HAVE_WCRTOMB
	mbstate_t shift_state;

	memset(&shift_state, 0, sizeof(shift_state));
#else
	/* Clear the shift state before starting. */
	wctomb(NULL, L'\0');
#endif

	/*
	 * Convert one wide char at a time into 'buff', whenever that
	 * fills, append it to the string.
	 */
	p = buff;
	while (*w != L'\0') {
		/* Flush the buffer when fewer than MB_CUR_MAX bytes
		 * (the longest character in this locale) remain free. */
		if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - MB_CUR_MAX)) {
			*p = '\0';
			archive_strcat(as, buff);
			p = buff;
		}
#if HAVE_WCRTOMB
		n = wcrtomb(p, *w++, &shift_state);
#else
		n = wctomb(p, *w++);
#endif
		/* -1 means the character has no representation. */
		if (n == -1)
			return (NULL);
		p += n;
	}
	*p = '\0';
	archive_strcat(as, buff);
	return (as);
#endif
}
453
#endif /* _WIN32 && ! __CYGWIN__ */