~ubuntu-branches/ubuntu/quantal/libarchive/quantal

« back to all changes in this revision

Viewing changes to libarchive/archive_string.c

Committer: Package Import Robot
Author(s): Andres Mejia
Date: 2012-02-23 19:29:24 UTC
mfrom: (8.1.10 sid)
Revision ID: package-import@ubuntu.com-20120223192924-73n4iedok5fwgsyr

Tags: 3.0.3-5

* Detect if locales or locales-all is installed for use with test suite.
* Bump Standards-Version to 3.9.3.

files added:
build/autoconf/config.rpath

build/autoconf/iconv.m4

build/autoconf/lib-ld.m4

build/autoconf/lib-link.m4

build/autoconf/lib-prefix.m4

cpio/test/test_option_0.c

debian/libarchive12.install

debian/libarchive12.lintian-overrides

doc/man/archive_entry_acl.3

doc/man/archive_entry_linkify.3

doc/man/archive_entry_paths.3

doc/man/archive_entry_perms.3

doc/man/archive_entry_stat.3

doc/man/archive_entry_time.3

doc/man/archive_read_set_options.3

doc/man/archive_write_set_options.3

doc/pdf/archive_entry_acl.3.pdf

doc/pdf/archive_entry_linkify.3.pdf

doc/pdf/archive_entry_paths.3.pdf

doc/pdf/archive_entry_perms.3.pdf

doc/pdf/archive_entry_stat.3.pdf

doc/pdf/archive_entry_time.3.pdf

doc/pdf/archive_read_set_options.3.pdf

doc/pdf/archive_write_set_options.3.pdf

doc/text/archive_entry_acl.3.txt

doc/text/archive_entry_linkify.3.txt

doc/text/archive_entry_paths.3.txt

doc/text/archive_entry_perms.3.txt

doc/text/archive_entry_stat.3.txt

doc/text/archive_entry_time.3.txt

doc/text/archive_read_set_options.3.txt

doc/text/archive_write_set_options.3.txt

doc/wiki/ManPageArchiveEntryAcl3.wiki

doc/wiki/ManPageArchiveEntryLinkify3.wiki

doc/wiki/ManPageArchiveEntryPaths3.wiki

doc/wiki/ManPageArchiveEntryPerms3.wiki

doc/wiki/ManPageArchiveEntryStat3.wiki

doc/wiki/ManPageArchiveEntryTime3.wiki

doc/wiki/ManPageArchiveReadSetOptions3.wiki

doc/wiki/ManPageArchiveWriteSetOptions3.wiki

libarchive/archive_acl.c

libarchive/archive_acl_private.h

libarchive/archive_crypto.c

libarchive/archive_crypto_private.h

libarchive/archive_entry_acl.3

libarchive/archive_entry_linkify.3

libarchive/archive_entry_locale.h

libarchive/archive_entry_paths.3

libarchive/archive_entry_perms.3

libarchive/archive_entry_sparse.c

libarchive/archive_entry_stat.3

libarchive/archive_entry_time.3

libarchive/archive_options.c

libarchive/archive_options_private.h

libarchive/archive_ppmd7.c

libarchive/archive_ppmd7_private.h

libarchive/archive_ppmd_private.h

libarchive/archive_rb.c

libarchive/archive_rb.h

libarchive/archive_read_disk_posix.c

libarchive/archive_read_disk_windows.c

libarchive/archive_read_set_options.3

libarchive/archive_read_set_options.c

libarchive/archive_read_support_filter_all.c

libarchive/archive_read_support_filter_bzip2.c

libarchive/archive_read_support_filter_compress.c

libarchive/archive_read_support_filter_gzip.c

libarchive/archive_read_support_filter_none.c

libarchive/archive_read_support_filter_program.c

libarchive/archive_read_support_filter_rpm.c

libarchive/archive_read_support_filter_uu.c

libarchive/archive_read_support_filter_xz.c

libarchive/archive_read_support_format_7zip.c

libarchive/archive_read_support_format_by_code.c

libarchive/archive_read_support_format_cab.c

libarchive/archive_read_support_format_lha.c

libarchive/archive_read_support_format_rar.c

libarchive/archive_string_composition.h

libarchive/archive_write_add_filter_bzip2.c

libarchive/archive_write_add_filter_compress.c

libarchive/archive_write_add_filter_gzip.c

libarchive/archive_write_add_filter_none.c

libarchive/archive_write_add_filter_program.c

libarchive/archive_write_add_filter_xz.c

libarchive/archive_write_disk_posix.c

libarchive/archive_write_disk_windows.c

libarchive/archive_write_set_format_7zip.c

libarchive/archive_write_set_format_gnutar.c

libarchive/archive_write_set_format_iso9660.c

libarchive/archive_write_set_format_xar.c

libarchive/archive_write_set_options.3

libarchive/archive_write_set_options.c

libarchive/test/test_acl_nfs4.c

libarchive/test/test_acl_pax.tar.uu

libarchive/test/test_acl_posix1e.c

libarchive/test/test_archive_clear_error.c

libarchive/test/test_archive_crypto.c

libarchive/test/test_archive_read_close_twice.c

libarchive/test/test_archive_read_close_twice_open_fd.c

libarchive/test/test_archive_read_close_twice_open_filename.c

libarchive/test/test_archive_read_next_header_empty.c

libarchive/test/test_archive_read_next_header_raw.c

libarchive/test/test_archive_read_open2.c

libarchive/test/test_archive_read_set_filter_option.c

libarchive/test/test_archive_read_set_format_option.c

libarchive/test/test_archive_read_set_option.c

libarchive/test/test_archive_read_set_options.c

libarchive/test/test_archive_read_support.c

libarchive/test/test_archive_set_error.c

libarchive/test/test_archive_string.c

libarchive/test/test_archive_string_conversion.c

libarchive/test/test_archive_string_conversion.txt.Z.uu

libarchive/test/test_archive_write_set_filter_option.c

libarchive/test/test_archive_write_set_format_option.c

libarchive/test/test_archive_write_set_option.c

libarchive/test/test_archive_write_set_options.c

libarchive/test/test_compat_lzip.c

libarchive/test/test_compat_lzip_1.tlz.uu

libarchive/test/test_compat_lzip_2.tlz.uu

libarchive/test/test_compat_mac-1.tar.Z.uu

libarchive/test/test_compat_mac-2.tar.Z.uu

libarchive/test/test_compat_mac.c

libarchive/test/test_compat_pax_libarchive_2x.c

libarchive/test/test_compat_pax_libarchive_2x.tar.Z.uu

libarchive/test/test_compat_solaris_pax_sparse.c

libarchive/test/test_compat_solaris_pax_sparse_1.pax.Z.uu

libarchive/test/test_compat_solaris_pax_sparse_2.pax.Z.uu

libarchive/test/test_compat_zip_2.zip.uu

libarchive/test/test_compat_zip_3.zip.uu

libarchive/test/test_compat_zip_4.zip.uu

libarchive/test/test_compat_zip_5.zip.uu

libarchive/test/test_filter_count.c

libarchive/test/test_fuzz.cab.uu

libarchive/test/test_fuzz.lzh.uu

libarchive/test/test_gnutar_filename_encoding.c

libarchive/test/test_read_disk_directory_traversals.c

libarchive/test/test_read_format_7zip.c

libarchive/test/test_read_format_7zip_bcj2_bzip2.7z.uu

libarchive/test/test_read_format_7zip_bcj2_copy_1.7z.uu

libarchive/test/test_read_format_7zip_bcj2_copy_2.7z.uu

libarchive/test/test_read_format_7zip_bcj2_copy_lzma.7z.uu

libarchive/test/test_read_format_7zip_bcj2_deflate.7z.uu

libarchive/test/test_read_format_7zip_bcj2_lzma1_1.7z.uu

libarchive/test/test_read_format_7zip_bcj2_lzma1_2.7z.uu

libarchive/test/test_read_format_7zip_bcj2_lzma2_1.7z.uu

libarchive/test/test_read_format_7zip_bcj2_lzma2_2.7z.uu

libarchive/test/test_read_format_7zip_bcj_bzip2.7z.uu

libarchive/test/test_read_format_7zip_bcj_copy.7z.uu

libarchive/test/test_read_format_7zip_bcj_deflate.7z.uu

libarchive/test/test_read_format_7zip_bcj_lzma1.7z.uu

libarchive/test/test_read_format_7zip_bcj_lzma2.7z.uu

libarchive/test/test_read_format_7zip_bzip2.7z.uu

libarchive/test/test_read_format_7zip_copy.7z.uu

libarchive/test/test_read_format_7zip_copy_2.7z.uu

libarchive/test/test_read_format_7zip_deflate.7z.uu

libarchive/test/test_read_format_7zip_delta_lzma1.7z.uu

libarchive/test/test_read_format_7zip_delta_lzma2.7z.uu

libarchive/test/test_read_format_7zip_empty_archive.7z.uu

libarchive/test/test_read_format_7zip_empty_file.7z.uu

libarchive/test/test_read_format_7zip_lzma1.7z.uu

libarchive/test/test_read_format_7zip_lzma1_2.7z.uu

libarchive/test/test_read_format_7zip_lzma1_lzma2.7z.uu

libarchive/test/test_read_format_7zip_lzma2.7z.uu

libarchive/test/test_read_format_7zip_ppmd.7z.uu

libarchive/test/test_read_format_7zip_symbolic_name.7z.uu

libarchive/test/test_read_format_cab.c

libarchive/test/test_read_format_cab_1.cab.uu

libarchive/test/test_read_format_cab_2.cab.uu

libarchive/test/test_read_format_cab_3.cab.uu

libarchive/test/test_read_format_cab_filename.c

libarchive/test/test_read_format_cab_filename_cp932.cab.uu

libarchive/test/test_read_format_cpio_afio.c

libarchive/test/test_read_format_cpio_bin_lzip.c

libarchive/test/test_read_format_cpio_filename.c

libarchive/test/test_read_format_cpio_filename_cp866.cpio.uu

libarchive/test/test_read_format_cpio_filename_eucjp.cpio.uu

libarchive/test/test_read_format_cpio_filename_koi8r.cpio.uu

libarchive/test/test_read_format_cpio_filename_utf8_jp.cpio.uu

libarchive/test/test_read_format_cpio_filename_utf8_ru.cpio.uu

libarchive/test/test_read_format_gtar_filename.c

libarchive/test/test_read_format_gtar_filename_cp866.tar.Z.uu

libarchive/test/test_read_format_gtar_filename_eucjp.tar.Z.uu

libarchive/test/test_read_format_gtar_filename_koi8r.tar.Z.uu

libarchive/test/test_read_format_iso_joliet_by_nero.iso.Z.uu

libarchive/test/test_read_format_iso_xorriso.c

libarchive/test/test_read_format_iso_xorriso.iso.Z.uu

libarchive/test/test_read_format_isojoliet_versioned.c

libarchive/test/test_read_format_lha.c

libarchive/test/test_read_format_lha_filename.c

libarchive/test/test_read_format_lha_filename_cp932.lzh.uu

libarchive/test/test_read_format_lha_header0.lzh.uu

libarchive/test/test_read_format_lha_header1.lzh.uu

libarchive/test/test_read_format_lha_header2.lzh.uu

libarchive/test/test_read_format_lha_header3.lzh.uu

libarchive/test/test_read_format_lha_lh0.lzh.uu

libarchive/test/test_read_format_lha_lh6.lzh.uu

libarchive/test/test_read_format_lha_lh7.lzh.uu

libarchive/test/test_read_format_lha_withjunk.lzh.uu

libarchive/test/test_read_format_mtree_nomagic.mtree.uu

libarchive/test/test_read_format_rar.c

libarchive/test/test_read_format_rar.rar.uu

libarchive/test/test_read_format_rar_binary_data.rar.uu

libarchive/test/test_read_format_rar_compress_best.rar.uu

libarchive/test/test_read_format_rar_compress_normal.rar.uu

libarchive/test/test_read_format_rar_multi_lzss_blocks.rar.uu

libarchive/test/test_read_format_rar_noeof.rar.uu

libarchive/test/test_read_format_rar_ppmd_lzss_conversion.rar.uu

libarchive/test/test_read_format_rar_sfx.exe.uu

libarchive/test/test_read_format_rar_subblock.rar.uu

libarchive/test/test_read_format_rar_unicode.rar.uu

libarchive/test/test_read_format_rar_windows.rar.uu

libarchive/test/test_read_format_tar_filename.c

libarchive/test/test_read_format_tar_filename_koi8r.tar.Z.uu

libarchive/test/test_read_format_ustar_filename.c

libarchive/test/test_read_format_ustar_filename_cp866.tar.Z.uu

libarchive/test/test_read_format_ustar_filename_eucjp.tar.Z.uu

libarchive/test/test_read_format_ustar_filename_koi8r.tar.Z.uu

libarchive/test/test_read_format_zip_filename.c

libarchive/test/test_read_format_zip_filename_cp866.zip.uu

libarchive/test/test_read_format_zip_filename_cp932.zip.uu

libarchive/test/test_read_format_zip_filename_koi8r.zip.uu

libarchive/test/test_read_format_zip_filename_utf8_jp.zip.uu

libarchive/test/test_read_format_zip_filename_utf8_ru.zip.uu

libarchive/test/test_read_format_zip_filename_utf8_ru2.zip.uu

libarchive/test/test_read_format_zip_length_at_end.zip.uu

libarchive/test/test_read_format_zip_symlink.zip.uu

libarchive/test/test_read_format_zip_ux.zip.uu

libarchive/test/test_read_truncated_filter.c

libarchive/test/test_sparse_basic.c

libarchive/test/test_ustar_filename_encoding.c

libarchive/test/test_write_compress_lzip.c

libarchive/test/test_write_disk_lookup.c

libarchive/test/test_write_format_7zip.c

libarchive/test/test_write_format_gnutar.c

libarchive/test/test_write_format_iso9660.c

libarchive/test/test_write_format_iso9660_boot.c

libarchive/test/test_write_format_iso9660_empty.c

libarchive/test/test_write_format_iso9660_filename.c

libarchive/test/test_write_format_iso9660_zisofs.c

libarchive/test/test_write_format_mtree_fflags.c

libarchive/test/test_write_format_tar_sparse.c

libarchive/test/test_write_format_xar.c

libarchive/test/test_write_format_xar_empty.c

libarchive/test/test_zip_filename_encoding.c

tar/test/test_option_C_upper.c

tar/test/test_option_H_upper.c

tar/test/test_option_L_upper.c

tar/test/test_option_O_upper.c

tar/test/test_option_U_upper.c

tar/test/test_option_X_upper.c

tar/test/test_option_b.c

tar/test/test_option_exclude.c

tar/test/test_option_gid_gname.c

tar/test/test_option_k.c

tar/test/test_option_keep_newer_files.c

tar/test/test_option_keep_newer_files.tar.Z.uu

tar/test/test_option_n.c

tar/test/test_option_newer_than.c

tar/test/test_option_s.tar.Z.uu

tar/test/test_option_uid_uname.c

tar/test/test_print_longpath.c

tar/test/test_print_longpath.tar.Z.uu

files removed:
.pc

.pc/.version

.pc/0001-Patch-from-upstream-revision-1990.patch

.pc/0001-Patch-from-upstream-revision-1990.patch/libarchive

.pc/0001-Patch-from-upstream-revision-1990.patch/libarchive/archive_read_disk_entry_from_file.c

.pc/0002-Patch-from-upstream-revision-1991.patch

.pc/0002-Patch-from-upstream-revision-1991.patch/libarchive

.pc/0002-Patch-from-upstream-revision-1991.patch/libarchive/archive_write_disk.c

.pc/0003-Patch-from-upstream-rev-2514.patch

.pc/0003-Patch-from-upstream-rev-2514.patch/libarchive

.pc/0003-Patch-from-upstream-rev-2514.patch/libarchive/archive_read_support_format_iso9660.c

.pc/0004-Patch-from-upstream-rev-2520.patch

.pc/0004-Patch-from-upstream-rev-2520.patch/libarchive

.pc/0004-Patch-from-upstream-rev-2520.patch/libarchive/archive_read_support_format_iso9660.c

.pc/0005-Patch-from-upstream-rev-2521.patch

.pc/0005-Patch-from-upstream-rev-2521.patch/libarchive

.pc/0005-Patch-from-upstream-rev-2521.patch/libarchive/archive_read_support_format_iso9660.c

.pc/0006-Patch-from-upstream-rev-2537.patch

.pc/0006-Patch-from-upstream-rev-2537.patch/libarchive

.pc/0006-Patch-from-upstream-rev-2537.patch/libarchive/archive_read_disk_entry_from_file.c

.pc/0006-Patch-from-upstream-rev-2537.patch/libarchive/archive_write_disk.c

.pc/0007-Patch-from-upstream-rev-2888.patch

.pc/0007-Patch-from-upstream-rev-2888.patch/libarchive

.pc/0007-Patch-from-upstream-rev-2888.patch/libarchive/archive_write_disk.c

.pc/0008-Patch-from-upstream-rev-2940.patch

.pc/0008-Patch-from-upstream-rev-2940.patch/libarchive

.pc/0008-Patch-from-upstream-rev-2940.patch/libarchive/archive_read_support_format_iso9660.c

.pc/0009-Patch-from-upstream-rev-3751.patch

.pc/0009-Patch-from-upstream-rev-3751.patch/cpio

.pc/0009-Patch-from-upstream-rev-3751.patch/cpio/cpio.c

.pc/0009-Patch-from-upstream-rev-3751.patch/libarchive

.pc/0009-Patch-from-upstream-rev-3751.patch/libarchive/test

.pc/0009-Patch-from-upstream-rev-3751.patch/libarchive/test/main.c

.pc/0009-Patch-from-upstream-rev-3751.patch/tar

.pc/0009-Patch-from-upstream-rev-3751.patch/tar/bsdtar.c

.pc/CVE-2011-1777.patch

.pc/CVE-2011-1777.patch/libarchive

.pc/CVE-2011-1777.patch/libarchive/archive_read_support_format_iso9660.c

.pc/CVE-2011-1778.patch

.pc/CVE-2011-1778.patch/libarchive

.pc/CVE-2011-1778.patch/libarchive/archive_read_support_format_tar.c

.pc/applied-patches

.pc/autoreconf.patch

.pc/autoreconf.patch/Makefile.in

.pc/autoreconf.patch/aclocal.m4

.pc/autoreconf.patch/build

.pc/autoreconf.patch/build/autoconf

.pc/autoreconf.patch/build/autoconf/config.guess

.pc/autoreconf.patch/build/autoconf/config.sub

.pc/autoreconf.patch/build/autoconf/libtool.m4

.pc/autoreconf.patch/build/autoconf/ltmain.sh

.pc/autoreconf.patch/build/autoconf/ltoptions.m4

.pc/autoreconf.patch/build/autoconf/ltsugar.m4

.pc/autoreconf.patch/build/autoconf/ltversion.m4

.pc/autoreconf.patch/build/autoconf/lt~obsolete.m4

.pc/autoreconf.patch/configure

.pc/bsdcpio_test_typo.diff

.pc/bsdcpio_test_typo.diff/cpio

.pc/bsdcpio_test_typo.diff/cpio/test

.pc/bsdcpio_test_typo.diff/cpio/test/test_option_c.c

build/autoconf/libtool.m4

build/autoconf/ltoptions.m4

build/autoconf/ltsugar.m4

build/autoconf/ltversion.m4

build/autoconf/lt~obsolete.m4

build/cmake/AddTest28.cmake

debian/libarchive1.install

debian/libarchive1.lintian-overrides

debian/libarchive1.symbols

debian/patches

debian/patches/0001-Patch-from-upstream-revision-1990.patch

debian/patches/0002-Patch-from-upstream-revision-1991.patch

debian/patches/0003-Patch-from-upstream-rev-2514.patch

debian/patches/0004-Patch-from-upstream-rev-2520.patch

debian/patches/0005-Patch-from-upstream-rev-2521.patch

debian/patches/0006-Patch-from-upstream-rev-2537.patch

debian/patches/0007-Patch-from-upstream-rev-2888.patch

debian/patches/0008-Patch-from-upstream-rev-2940.patch

debian/patches/0009-Patch-from-upstream-rev-3751.patch

debian/patches/CVE-2011-1777.patch

debian/patches/CVE-2011-1778.patch

debian/patches/autoreconf.patch

debian/patches/bsdcpio_test_typo.diff

debian/patches/series

examples/minitar/tree.c

examples/minitar/tree.h

libarchive/archive_hash.h

libarchive/archive_read_disk.c

libarchive/archive_read_support_compression_all.c

libarchive/archive_read_support_compression_bzip2.c

libarchive/archive_read_support_compression_compress.c

libarchive/archive_read_support_compression_gzip.c

libarchive/archive_read_support_compression_none.c

libarchive/archive_read_support_compression_program.c

libarchive/archive_read_support_compression_rpm.c

libarchive/archive_read_support_compression_uu.c

libarchive/archive_read_support_compression_xz.c

libarchive/archive_write_disk.c

libarchive/archive_write_set_compression_bzip2.c

libarchive/archive_write_set_compression_compress.c

libarchive/archive_write_set_compression_gzip.c

libarchive/archive_write_set_compression_none.c

libarchive/archive_write_set_compression_program.c

libarchive/archive_write_set_compression_xz.c

libarchive/test/test_acl_basic.c

files modified:
CMakeLists.txt

COPYING

INSTALL

Makefile.am

Makefile.in

NEWS

README

aclocal.m4

build/autoconf/config.guess

build/autoconf/config.sub

build/autoconf/ltmain.sh *

build/autogen.sh

build/clean.sh

build/cmake/FindLZMA.cmake

build/cmake/config.h.in

build/pkgconfig/libarchive.pc.in

build/version

config.h.in

configure

configure.ac

contrib/README

contrib/libarchive.1aix53.spec

contrib/libarchive.spec

contrib/psota-benchmark/results.txt

contrib/psota-benchmark/tcp.sh

contrib/shar/shar.c

contrib/untar.c

cpio/bsdcpio.1

cpio/cmdline.c

cpio/cpio.c

cpio/cpio.h

cpio/test/CMakeLists.txt

cpio/test/list.h

cpio/test/main.c

cpio/test/test.h

cpio/test/test_0.c

cpio/test/test_basic.c

cpio/test/test_format_newc.c

cpio/test/test_option_c.c

cpio/test/test_option_t.c

cpio/test/test_option_u.c

cpio/test/test_owner_parse.c

debian/changelog

debian/control

debian/copyright

debian/gbp.conf

debian/rules

debian/watch

doc/html/Makefile

doc/man/Makefile

doc/man/archive_entry.3

doc/man/archive_read.3

doc/man/archive_read_disk.3

doc/man/archive_util.3

doc/man/archive_write.3

doc/man/archive_write_disk.3

doc/man/bsdcpio.1

doc/man/bsdtar.1

doc/man/cpio.5

doc/man/libarchive-formats.5

doc/man/libarchive.3

doc/man/libarchive_internals.3

doc/man/tar.5

doc/mdoc2wiki.awk

doc/pdf/Makefile

doc/pdf/archive_entry.3.pdf

doc/pdf/archive_read.3.pdf

doc/pdf/archive_read_disk.3.pdf

doc/pdf/archive_util.3.pdf

doc/pdf/archive_write.3.pdf

doc/pdf/archive_write_disk.3.pdf

doc/pdf/bsdcpio.1.pdf

doc/pdf/bsdtar.1.pdf

doc/pdf/cpio.5.pdf

doc/pdf/libarchive-formats.5.pdf

doc/pdf/libarchive.3.pdf

doc/pdf/libarchive_internals.3.pdf

doc/pdf/mtree.5.pdf

doc/pdf/tar.5.pdf

doc/text/Makefile

doc/text/archive_entry.3.txt

doc/text/archive_read.3.txt

doc/text/archive_read_disk.3.txt

doc/text/archive_util.3.txt

doc/text/archive_write.3.txt

doc/text/archive_write_disk.3.txt

doc/text/bsdcpio.1.txt

doc/text/bsdtar.1.txt

doc/text/cpio.5.txt

doc/text/libarchive-formats.5.txt

doc/text/libarchive.3.txt

doc/text/libarchive_internals.3.txt

doc/text/mtree.5.txt

doc/text/tar.5.txt

doc/wiki/Makefile

doc/wiki/ManPageArchiveEntry3.wiki

doc/wiki/ManPageArchiveRead3.wiki

doc/wiki/ManPageArchiveReadDisk3.wiki

doc/wiki/ManPageArchiveUtil3.wiki

doc/wiki/ManPageArchiveWrite3.wiki

doc/wiki/ManPageArchiveWriteDisk3.wiki

doc/wiki/ManPageBsdcpio1.wiki

doc/wiki/ManPageBsdtar1.wiki

doc/wiki/ManPageCpio5.wiki

doc/wiki/ManPageLibarchive3.wiki

doc/wiki/ManPageLibarchiveFormats5.wiki

doc/wiki/ManPageLibarchiveInternals3.wiki

doc/wiki/ManPageTar5.wiki

examples/minitar/minitar.c

examples/tarfilter.c

examples/untar.c

libarchive/CMakeLists.txt

libarchive/archive.h

libarchive/archive_check_magic.c

libarchive/archive_crc32.h

libarchive/archive_entry.3

libarchive/archive_entry.c

libarchive/archive_entry.h

libarchive/archive_entry_copy_bhfi.c

libarchive/archive_entry_copy_stat.c

libarchive/archive_entry_link_resolver.c

libarchive/archive_entry_private.h

libarchive/archive_entry_stat.c

libarchive/archive_private.h

libarchive/archive_read.3

libarchive/archive_read.c

libarchive/archive_read_data_into_fd.c

libarchive/archive_read_disk.3

libarchive/archive_read_disk_entry_from_file.c

libarchive/archive_read_disk_private.h

libarchive/archive_read_disk_set_standard_lookup.c

libarchive/archive_read_extract.c

libarchive/archive_read_open_fd.c

libarchive/archive_read_open_file.c

libarchive/archive_read_open_filename.c

libarchive/archive_read_open_memory.c

libarchive/archive_read_private.h

libarchive/archive_read_support_format_all.c

libarchive/archive_read_support_format_ar.c

libarchive/archive_read_support_format_cpio.c

libarchive/archive_read_support_format_empty.c

libarchive/archive_read_support_format_iso9660.c

libarchive/archive_read_support_format_mtree.c

libarchive/archive_read_support_format_raw.c

libarchive/archive_read_support_format_tar.c

libarchive/archive_read_support_format_xar.c

libarchive/archive_read_support_format_zip.c

libarchive/archive_string.c

libarchive/archive_string.h

libarchive/archive_string_sprintf.c

libarchive/archive_util.3

libarchive/archive_util.c

libarchive/archive_virtual.c

libarchive/archive_windows.c

libarchive/archive_windows.h

libarchive/archive_write.3

libarchive/archive_write.c

libarchive/archive_write_disk.3

libarchive/archive_write_disk_set_standard_lookup.c

libarchive/archive_write_open_filename.c

libarchive/archive_write_open_memory.c

libarchive/archive_write_private.h

libarchive/archive_write_set_format.c

libarchive/archive_write_set_format_ar.c

libarchive/archive_write_set_format_by_name.c

libarchive/archive_write_set_format_cpio.c

libarchive/archive_write_set_format_cpio_newc.c

libarchive/archive_write_set_format_mtree.c

libarchive/archive_write_set_format_pax.c

libarchive/archive_write_set_format_shar.c

libarchive/archive_write_set_format_ustar.c

libarchive/archive_write_set_format_zip.c

libarchive/config_freebsd.h

libarchive/cpio.5

libarchive/filter_fork_windows.c

libarchive/libarchive-formats.5

libarchive/libarchive.3

libarchive/libarchive_internals.3

libarchive/tar.5

libarchive/test/CMakeLists.txt

libarchive/test/list.h

libarchive/test/main.c

libarchive/test/read_open_memory.c

libarchive/test/test.h

libarchive/test/test_acl_freebsd.c

libarchive/test/test_acl_pax.c

libarchive/test/test_archive_api_feature.c

libarchive/test/test_bad_fd.c

libarchive/test/test_compat_bzip2.c

libarchive/test/test_compat_cpio.c

libarchive/test/test_compat_gtar.c

libarchive/test/test_compat_gzip.c

libarchive/test/test_compat_lzma.c

libarchive/test/test_compat_solaris_tar_acl.c

libarchive/test/test_compat_tar_hardlink.c

libarchive/test/test_compat_xz.c

libarchive/test/test_compat_zip.c

libarchive/test/test_empty_write.c

libarchive/test/test_entry.c

libarchive/test/test_extattr_freebsd.c

libarchive/test/test_fuzz.c

libarchive/test/test_open_failure.c

libarchive/test/test_open_fd.c

libarchive/test/test_open_file.c

libarchive/test/test_open_filename.c

libarchive/test/test_pax_filename_encoding.c

libarchive/test/test_read_compress_program.c

libarchive/test/test_read_data_large.c

libarchive/test/test_read_disk.c

libarchive/test/test_read_disk_entry_from_file.c

libarchive/test/test_read_extract.c

libarchive/test/test_read_file_nonexistent.c

libarchive/test/test_read_format_ar.c

libarchive/test/test_read_format_cpio_bin.c

libarchive/test/test_read_format_cpio_bin_Z.c

libarchive/test/test_read_format_cpio_bin_be.c

libarchive/test/test_read_format_cpio_bin_bz2.c

libarchive/test/test_read_format_cpio_bin_gz.c

libarchive/test/test_read_format_cpio_bin_lzma.c

libarchive/test/test_read_format_cpio_bin_xz.c

libarchive/test/test_read_format_cpio_odc.c

libarchive/test/test_read_format_cpio_svr4_bzip2_rpm.c

libarchive/test/test_read_format_cpio_svr4_gzip.c

libarchive/test/test_read_format_cpio_svr4_gzip_rpm.c

libarchive/test/test_read_format_cpio_svr4c_Z.c

libarchive/test/test_read_format_empty.c

libarchive/test/test_read_format_gtar_gz.c

libarchive/test/test_read_format_gtar_lzma.c

libarchive/test/test_read_format_gtar_sparse.c

libarchive/test/test_read_format_iso_Z.c

libarchive/test/test_read_format_iso_multi_extent.c

libarchive/test/test_read_format_isojoliet_bz2.c

libarchive/test/test_read_format_isojoliet_long.c

libarchive/test/test_read_format_isojoliet_rr.c

libarchive/test/test_read_format_isorr_bz2.c

libarchive/test/test_read_format_isorr_ce.c

libarchive/test/test_read_format_isorr_new_bz2.c

libarchive/test/test_read_format_isorr_rr_moved.c

libarchive/test/test_read_format_isozisofs_bz2.c

libarchive/test/test_read_format_mtree.c

libarchive/test/test_read_format_mtree.mtree.uu

libarchive/test/test_read_format_pax_bz2.c

libarchive/test/test_read_format_raw.c

libarchive/test/test_read_format_tar.c

libarchive/test/test_read_format_tar_empty_filename.c

libarchive/test/test_read_format_tbz.c

libarchive/test/test_read_format_tgz.c

libarchive/test/test_read_format_tlz.c

libarchive/test/test_read_format_txz.c

libarchive/test/test_read_format_tz.c

libarchive/test/test_read_format_xar.c

libarchive/test/test_read_format_zip.c

libarchive/test/test_read_format_zip.zip.uu

libarchive/test/test_read_large.c

libarchive/test/test_read_pax_truncated.c

libarchive/test/test_read_position.c

libarchive/test/test_read_truncated.c

libarchive/test/test_read_uu.c

libarchive/test/test_tar_filenames.c

libarchive/test/test_tar_large.c

libarchive/test/test_ustar_filenames.c

libarchive/test/test_write_compress.c

libarchive/test/test_write_compress_bzip2.c

libarchive/test/test_write_compress_gzip.c

libarchive/test/test_write_compress_lzma.c

libarchive/test/test_write_compress_program.c

libarchive/test/test_write_compress_xz.c

libarchive/test/test_write_disk.c

libarchive/test/test_write_disk_failures.c

libarchive/test/test_write_disk_hardlink.c

libarchive/test/test_write_disk_perms.c

libarchive/test/test_write_disk_secure.c

libarchive/test/test_write_disk_sparse.c

libarchive/test/test_write_disk_symlink.c

libarchive/test/test_write_disk_times.c

libarchive/test/test_write_format_ar.c

libarchive/test/test_write_format_cpio.c

libarchive/test/test_write_format_cpio_empty.c

libarchive/test/test_write_format_cpio_newc.c

libarchive/test/test_write_format_cpio_odc.c

libarchive/test/test_write_format_mtree.c

libarchive/test/test_write_format_pax.c

libarchive/test/test_write_format_shar_empty.c

libarchive/test/test_write_format_tar.c

libarchive/test/test_write_format_tar_empty.c

libarchive/test/test_write_format_tar_ustar.c

libarchive/test/test_write_format_zip.c

libarchive/test/test_write_format_zip_empty.c

libarchive/test/test_write_format_zip_no_compression.c

libarchive/test/test_write_open_memory.c

libarchive_fe/line_reader.c

libarchive_fe/matching.c

libarchive_fe/pathmatch.c

tar/CMakeLists.txt

tar/bsdtar.1

tar/bsdtar.c

tar/bsdtar.h

tar/bsdtar_platform.h

tar/bsdtar_windows.h

tar/cmdline.c

tar/getdate.c

tar/read.c

tar/subst.c

tar/test/CMakeLists.txt

tar/test/list.h

tar/test/main.c

tar/test/test.h

tar/test/test_0.c

tar/test/test_basic.c

tar/test/test_option_T_upper.c

tar/test/test_option_q.c

tar/test/test_option_r.c

tar/test/test_option_s.c

tar/test/test_patterns.c

tar/test/test_strip_components.c

tar/test/test_symlink_dir.c

tar/tree.c

tar/util.c

tar/write.c

Show diffs side-by-side

added added

removed removed

libarchive/archive_string.c

/*-
 * [...]
 * Redistribution and use in source and binary forms, with or without
 * [...]
 * Basic resizable string support, to simplify manipulating arbitrary-sized
 * strings while minimizing heap activity.
 *
 * In particular, the buffer used by a string object is only grown, it
 * never shrinks, so you can clear and reuse the same string object
 * without incurring additional memory allocations.
 */

#ifdef HAVE_ERRNO_H

#include <errno.h>

#endif

#ifdef HAVE_ICONV_H

#include <iconv.h>

#endif

#ifdef HAVE_LANGINFO_H

#include <langinfo.h>

#endif

#ifdef HAVE_LOCALCHARSET_H

#include <localcharset.h>

#endif

#ifdef HAVE_STDLIB_H

#include <stdlib.h>

#endif

#if defined(_WIN32) && !defined(__CYGWIN__)

#include <windows.h>

#include <locale.h>

#endif

#if defined(__APPLE__)

#include <CoreServices/CoreServices.h>

#endif

#include "archive_endian.h"

#include "archive_private.h"

#include "archive_string.h"

struct archive_string *

__archive_string_append(struct archive_string *as, const char *p, size_t s)

#include "archive_string_composition.h"

/*
 * Fallback used when neither the build configuration (HAVE_WMEMCPY) nor an
 * existing macro provides wmemcpy(): copy `i` wide characters from `b` to
 * `a` with memcpy() and yield `a` as a wchar_t pointer.
 *
 * The whole expansion is parenthesized so the cast result, not the raw
 * memcpy() call, is what any surrounding postfix operator applies to.
 */
#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy)
#define wmemcpy(a,b,i)  ((wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t)))
#endif

struct archive_string_conv {

struct archive_string_conv *next;

char *from_charset;

char *to_charset;

unsigned from_cp;

unsigned to_cp;

/* Set 1 if from_charset and to_charset are the same. */

int same;

int flag;

#define SCONV_TO_CHARSET 1 /* MBS is being converted to specified

* charset. */

#define SCONV_FROM_CHARSET (1<<1) /* MBS is being converted from

* specified charset. */

#define SCONV_BEST_EFFORT (1<<2) /* Copy at least ASCII code. */

#define SCONV_WIN_CP (1<<3) /* Use Windows API for converting

* MBS. */

#define SCONV_UTF8_LIBARCHIVE_2 (1<<4) /* Incorrect UTF-8 made by libarchive

* 2.x in the wrong assumption. */

#define SCONV_NORMALIZATION_C (1<<6) /* Need normalization to be Form C.

* Before UTF-8 characters are actually

* processed. */

#define SCONV_NORMALIZATION_D (1<<7) /* Need normalization to be Form D.

* Before UTF-8 characters are actually

100

* processed.

101

* Currently this only for MAC OS X. */

102

#define SCONV_TO_UTF8 (1<<8) /* "to charset" side is UTF-8. */

103

#define SCONV_FROM_UTF8 (1<<9) /* "from charset" side is UTF-8. */

104

#define SCONV_TO_UTF16BE (1<<10) /* "to charset" side is UTF-16BE. */

105

#define SCONV_FROM_UTF16BE (1<<11) /* "from charset" side is UTF-16BE. */

106

#define SCONV_TO_UTF16LE (1<<12) /* "to charset" side is UTF-16LE. */

107

#define SCONV_FROM_UTF16LE (1<<13) /* "from charset" side is UTF-16LE. */

108

#define SCONV_TO_UTF16 (SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)

109

#define SCONV_FROM_UTF16 (SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)

110

111

#if HAVE_ICONV

112

iconv_t cd;

113

iconv_t cd_w;/* Use at archive_mstring on

114

* Windows. */

115

#endif

116

/* A temporary buffer for normalization. */

117

struct archive_string utftmp;

118

#if defined(__APPLE__)

119

UnicodeToTextInfo uniInfo;

120

struct archive_string utf16nfc;

121

struct archive_string utf16nfd;

122

#endif

123

int (*converter[2])(struct archive_string *, const void *, size_t,

124

struct archive_string_conv *);

125

int nconverter;

126

};

/* Code page numbers used by this file. */
#define CP_C_LOCALE	0	/* "C" locale only for this file. */
#define CP_UTF16LE	1200
#define CP_UTF16BE	1201

/*
 * UTF-16 surrogate range tests and Unicode limits.  Each macro evaluates
 * its argument twice, so pass an expression without side effects.
 */
#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF)
#define IS_LOW_SURROGATE_LA(uc)	 ((uc) >= 0xDC00 && (uc) <= 0xDFFF)
/* True for any code unit in the surrogate range, high or low. */
#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF)
#define UNICODE_MAX		0x10FFFF
#define UNICODE_R_CHAR		0xFFFD	/* Replacement character. */

/*
 * Set U+FFFD (Replacement character) in UTF-8: stores the three bytes
 * EF BF BD at outp[0..2].  `outp` is evaluated three times.
 */
#define UTF8_SET_R_CHAR(outp) do {		\
			(outp)[0] = 0xef;	\
			(outp)[1] = 0xbf;	\
			(outp)[2] = 0xbd;	\
} while (0)
/* Number of bytes written by UTF8_SET_R_CHAR(). */
#define UTF8_R_CHAR_SIZE	3

/*
 * Forward declarations for the file-local helpers.
 */

/* Conversion-object helpers. */
static struct archive_string_conv *find_sconv_object(struct archive *,
	const char *, const char *);
static void add_sconv_object(struct archive *, struct archive_string_conv *);
static struct archive_string_conv *create_sconv_object(const char *,
	const char *, unsigned, int);
static void free_sconv_object(struct archive_string_conv *);
static struct archive_string_conv *get_sconv_object(struct archive *,
	const char *, const char *, int);

/* Code page helpers. */
static unsigned make_codepage_from_charset(const char *);
static unsigned get_current_codepage(void);
static unsigned get_current_oemcp(void);

/* Byte-length helpers. */
static size_t mbsnbytes(const void *, size_t);
static size_t utf16nbytes(const void *, size_t);

/* Helpers declared only for native Windows builds (not Cygwin). */
#if defined(_WIN32) && !defined(__CYGWIN__)
static int archive_wstring_append_from_mbs_in_codepage(
    struct archive_wstring *, const char *, size_t,
    struct archive_string_conv *);
static int archive_string_append_from_wcs_in_codepage(struct archive_string *,
    const wchar_t *, size_t, struct archive_string_conv *);
static int is_big_endian(void);
static int strncat_in_codepage(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int win_strncat_from_utf16be(struct archive_string *, const void *, size_t,
    struct archive_string_conv *);
static int win_strncat_from_utf16le(struct archive_string *, const void *, size_t,
    struct archive_string_conv *);
static int win_strncat_to_utf16be(struct archive_string *, const void *, size_t,
    struct archive_string_conv *);
static int win_strncat_to_utf16le(struct archive_string *, const void *, size_t,
    struct archive_string_conv *);
#endif

/* Converters sharing the signature of archive_string_conv.converter[]. */
static int best_effort_strncat_from_utf16be(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);
static int best_effort_strncat_from_utf16le(struct archive_string *, const void *,
    size_t, struct archive_string_conv *);

180

static int best_effort_strncat_to_utf16be(struct archive_string *, const void *,

181

size_t, struct archive_string_conv *);

182

static int best_effort_strncat_to_utf16le(struct archive_string *, const void *,

183

size_t, struct archive_string_conv *);

184

#if defined(HAVE_ICONV)

185

static int iconv_strncat_in_locale(struct archive_string *, const void *,

186

size_t, struct archive_string_conv *);

187

#endif

188

static int best_effort_strncat_in_locale(struct archive_string *, const void *,

189

size_t, struct archive_string_conv *);

190

static int _utf8_to_unicode(uint32_t *, const char *, size_t);

191

static int utf8_to_unicode(uint32_t *, const char *, size_t);

192

static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t);

193

static int cesu8_to_unicode(uint32_t *, const char *, size_t);

194

static size_t unicode_to_utf8(char *, size_t, uint32_t);

195

static int utf16_to_unicode(uint32_t *, const char *, size_t, int);

196

static size_t unicode_to_utf16be(char *, size_t, uint32_t);

197

static size_t unicode_to_utf16le(char *, size_t, uint32_t);

198

static int strncat_from_utf8_libarchive2(struct archive_string *,

199

const void *, size_t, struct archive_string_conv *);

200

static int strncat_from_utf8_to_utf8(struct archive_string *, const void *,

201

size_t, struct archive_string_conv *);

202

static int archive_string_normalize_C(struct archive_string *, const void *,

203

size_t, struct archive_string_conv *);

204

#if defined(__APPLE__)

205

static int archive_string_normalize_D(struct archive_string *, const void *,

206

size_t, struct archive_string_conv *);

207

#endif

208

static int archive_string_append_unicode(struct archive_string *,

209

const void *, size_t, struct archive_string_conv *);

210

211

static struct archive_string *

212

archive_string_append(struct archive_string *as, const char *p, size_t s)

213

{

if (__archive_string_ensure(as, as->length + s + 1) == NULL)

__archive_errx(1, "Out of memory");

214

if (archive_string_ensure(as, as->length + s + 1) == NULL)

215

return (NULL);

216

memcpy(as->s + as->length, p, s);

as->s[as->length + s] = 0;

as->length += s;

return (as);

}

void

__archive_string_copy(struct archive_string *dest, struct archive_string *src)

{

if (src->length == 0)

dest->length = 0;

else {

if (__archive_string_ensure(dest, src->length + 1) == NULL)

__archive_errx(1, "Out of memory");

memcpy(dest->s, src->s, src->length);

dest->length = src->length;

dest->s[dest->length] = 0;

}

void

__archive_string_concat(struct archive_string *dest, struct archive_string *src)

{

if (src->length > 0) {

if (__archive_string_ensure(dest, dest->length + src->length + 1) == NULL)

__archive_errx(1, "Out of memory");

memcpy(dest->s + dest->length, src->s, src->length);

dest->length += src->length;

dest->s[dest->length] = 0;

}

void

__archive_string_free(struct archive_string *as)

{

as->length = 0;

as->buffer_length = 0;

if (as->s != NULL) {

free(as->s);

as->s = NULL;

}

217

as->length += s;

218

as->s[as->length] = 0;

219

return (as);

220

}

221

222

static struct archive_wstring *

223

archive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s)

224

{

225

if (archive_wstring_ensure(as, as->length + s + 1) == NULL)

226

return (NULL);

227

wmemcpy(as->s + as->length, p, s);

228

as->length += s;

229

as->s[as->length] = 0;

230

return (as);

231

}

232

233

void

234

archive_string_concat(struct archive_string *dest, struct archive_string *src)

235

{

236

if (archive_string_append(dest, src->s, src->length) == NULL)

237

__archive_errx(1, "Out of memory");

238

}

239

240

void

241

archive_wstring_concat(struct archive_wstring *dest, struct archive_wstring *src)

242

{

243

if (archive_wstring_append(dest, src->s, src->length) == NULL)

244

__archive_errx(1, "Out of memory");

245

}

246

247

void

248

archive_string_free(struct archive_string *as)

249

{

250

as->length = 0;

251

as->buffer_length = 0;

252

free(as->s);

253

as->s = NULL;

254

}

255

256

void

257

archive_wstring_free(struct archive_wstring *as)

258

{

259

as->length = 0;

260

as->buffer_length = 0;

261

free(as->s);

262

as->s = NULL;

263

}

264

265

struct archive_wstring *

266

archive_wstring_ensure(struct archive_wstring *as, size_t s)

267

{

268

return (struct archive_wstring *)

269

archive_string_ensure((struct archive_string *)as,

270

s * sizeof(wchar_t));

271

}

272

273

/* Returns NULL on any allocation failure. */

274

struct archive_string *

100

__archive_string_ensure(struct archive_string *as, size_t s)

275

archive_string_ensure(struct archive_string *as, size_t s)

101

276

{

277

char *p;

278

size_t new_length;

279

102

280

/* If buffer is already big enough, don't reallocate. */

103

281

if (as->s && (s <= as->buffer_length))

104

282

return (as);

112

290

113

291

if (as->buffer_length < 32)

114

292

/* Start with a minimum 32-character buffer. */

115

as->buffer_length = 32;

293

new_length = 32;

116

294

else if (as->buffer_length < 8192)

117

295

/* Buffers under 8k are doubled for speed. */

118

as->buffer_length += as->buffer_length;

296

new_length = as->buffer_length + as->buffer_length;

119

297

else {

120

298

/* Buffers 8k and over grow by at least 25% each time. */

121

size_t old_length = as->buffer_length;

122

as->buffer_length += as->buffer_length / 4;

123

/* Be safe: If size wraps, release buffer and return NULL. */

124

if (as->buffer_length < old_length) {

125

free(as->s);

126

as->s = NULL;

299

new_length = as->buffer_length + as->buffer_length / 4;

300

/* Be safe: If size wraps, fail. */

301

if (new_length < as->buffer_length) {

302

/* On failure, wipe the string and return NULL. */

303

archive_string_free(as);

304

errno = ENOMEM;/* Make sure errno has ENOMEM. */

127

305

return (NULL);

128

306

}

129

307

}

132

310

* grow the buffer. In any case, we have to grow it enough to

133

311

* hold the request.

134

312

135

if (as->buffer_length < s)

136

as->buffer_length = s;

313

if (new_length < s)

314

new_length = s;

137

315

/* Now we can reallocate the buffer. */

138

as->s = (char *)realloc(as->s, as->buffer_length);

139

if (as->s == NULL)

316

p = (char *)realloc(as->s, new_length);

317

if (p == NULL) {

318

/* On failure, wipe the string and return NULL. */

319

archive_string_free(as);

320

errno = ENOMEM;/* Make sure errno has ENOMEM. */

140

321

return (NULL);

322

}

323

324

as->s = p;

325

as->buffer_length = new_length;

141

326

return (as);

142

327

}

143

328

329

330

* TODO: See if there's a way to avoid scanning

331

* the source string twice. Then test to see

332

* if it actually helps (remember that we're almost

333

* always called with pretty short arguments, so

334

* such an optimization might not help).

335

144

336

struct archive_string *

145

__archive_strncat(struct archive_string *as, const void *_p, size_t n)

337

archive_strncat(struct archive_string *as, const void *_p, size_t n)

146

338

{

147

339

size_t s;

148

340

const char *p, *pp;

152

344

/* Like strlen(p), except won't examine positions beyond p[n]. */

153

345

s = 0;

154

346

pp = p;

155

while (*pp && s < n) {

156

pp++;

157

s++;

158

}

159

return (__archive_string_append(as, p, s));

160

}

161

162

struct archive_string *

163

__archive_strappend_char(struct archive_string *as, char c)

164

{

165

return (__archive_string_append(as, &c, 1));

166

}

167

168

169

* Translates a wide character string into UTF-8 and appends

170

* to the archive_string. Note: returns NULL if conversion fails,

171

* but still leaves a best-effort conversion in the argument as.

172

173

struct archive_string *

174

__archive_strappend_w_utf8(struct archive_string *as, const wchar_t *w)

175

{

176

char *p;

177

unsigned wc;

178

char buff[256];

179

struct archive_string *return_val = as;

180

181

182

* Convert one wide char at a time into 'buff', whenever that

183

* fills, append it to the string.

347

while (s < n && *pp) {

348

pp++;

349

s++;

350

}

351

if ((as = archive_string_append(as, p, s)) == NULL)

352

__archive_errx(1, "Out of memory");

353

return (as);

354

}

355

356

struct archive_wstring *

357

archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n)

358

{

359

size_t s;

360

const wchar_t *pp;

361

362

/* Like strlen(p), except won't examine positions beyond p[n]. */

363

s = 0;

364

pp = p;

365

while (s < n && *pp) {

366

pp++;

367

s++;

368

}

369

if ((as = archive_wstring_append(as, p, s)) == NULL)

370

__archive_errx(1, "Out of memory");

371

return (as);

372

}

373

374

struct archive_string *

375

archive_strcat(struct archive_string *as, const void *p)

376

{

377

/* strcat is just strncat without an effective limit.

378

* Assert that we'll never get called with a source

379

* string over 16MB.

380

* TODO: Review all uses of strcat in the source

381

* and try to replace them with strncat().

184

382

185

p = buff;

186

while (*w != L'\0') {

187

/* Flush the buffer when we have <=16 bytes free. */

188

/* (No encoding has a single character >16 bytes.) */

189

if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - 16)) {

190

*p = '\0';

191

archive_strcat(as, buff);

192

p = buff;

193

}

194

wc = *w++;

195

/* If this is a surrogate pair, assemble the full code point.*/

196

/* Note: wc must not be wchar_t here, because the full code

197

* point can be more than 16 bits! */

198

if (wc >= 0xD800 && wc <= 0xDBff

199

&& *w >= 0xDC00 && *w <= 0xDFFF) {

200

wc -= 0xD800;

201

wc *= 0x400;

202

wc += (*w - 0xDC00);

203

wc += 0x10000;

204

++w;

205

}

206

/* Translate code point to UTF8 */

207

if (wc <= 0x7f) {

208

*p++ = (char)wc;

209

} else if (wc <= 0x7ff) {

210

*p++ = 0xc0 | ((wc >> 6) & 0x1f);

211

*p++ = 0x80 | (wc & 0x3f);

212

} else if (wc <= 0xffff) {

213

*p++ = 0xe0 | ((wc >> 12) & 0x0f);

214

*p++ = 0x80 | ((wc >> 6) & 0x3f);

215

*p++ = 0x80 | (wc & 0x3f);

216

} else if (wc <= 0x1fffff) {

217

*p++ = 0xf0 | ((wc >> 18) & 0x07);

218

*p++ = 0x80 | ((wc >> 12) & 0x3f);

219

*p++ = 0x80 | ((wc >> 6) & 0x3f);

220

*p++ = 0x80 | (wc & 0x3f);

383

return archive_strncat(as, p, 0x1000000);

384

}

385

386

struct archive_wstring *

387

archive_wstrcat(struct archive_wstring *as, const wchar_t *p)

388

{

389

/* Ditto. */

390

return archive_wstrncat(as, p, 0x1000000);

391

}

392

393

struct archive_string *

394

archive_strappend_char(struct archive_string *as, char c)

395

{

396

if ((as = archive_string_append(as, &c, 1)) == NULL)

397

__archive_errx(1, "Out of memory");

398

return (as);

399

}

400

401

struct archive_wstring *

402

archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c)

403

{

404

if ((as = archive_wstring_append(as, &c, 1)) == NULL)

405

__archive_errx(1, "Out of memory");

406

return (as);

407

}

408

409

410

* Get the "current character set" name to use with iconv.

411

* On FreeBSD, the empty character set name "" chooses

412

* the correct character encoding for the current locale,

413

* so this isn't necessary.

414

* But iconv on Mac OS 10.6 doesn't seem to handle this correctly;

415

* on that system, we have to explicitly call nl_langinfo()

416

* to get the right name. Not sure about other platforms.

417

418

* NOTE: GNU libiconv does not recognize the character-set name

419

* which some platform nl_langinfo(CODESET) returns, so we should

420

* use locale_charset() instead of nl_langinfo(CODESET) for GNU libiconv.

421

422

static const char *

423

default_iconv_charset(const char *charset) {

424

if (charset != NULL && charset[0] != '\0')

425

return charset;

426

#if HAVE_LOCALE_CHARSET && !defined(__APPLE__)

427

/* locale_charset() is broken on Mac OS */

428

return locale_charset();

429

#elif HAVE_NL_LANGINFO

430

return nl_langinfo(CODESET);

431

#else

432

return "";

433

#endif

434

}

435

436

#if defined(_WIN32) && !defined(__CYGWIN__)

437

438

439

* Convert MBS to WCS.

440

* Note: returns -1 if conversion fails.

441

442

int

443

archive_wstring_append_from_mbs(struct archive_wstring *dest,

444

const char *p, size_t len)

445

{

446

int r = archive_wstring_append_from_mbs_in_codepage(dest, p, len, NULL);

447

if (r != 0 && errno == ENOMEM)

448

__archive_errx(1, "No memory");

449

return (r);

450

}

451

452

static int

453

archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest,

454

const char *s, size_t length, struct archive_string_conv *sc)

455

{

456

int count, ret = 0;

457

UINT from_cp;

458

459

if (sc != NULL)

460

from_cp = sc->from_cp;

461

else

462

from_cp = get_current_codepage();

463

464

if (from_cp == CP_C_LOCALE) {

465

466

* "C" locale special process.

467

468

wchar_t *ws;

469

const unsigned char *mp;

470

471

if (NULL == archive_wstring_ensure(dest,

472

dest->length + length + 1))

473

return (-1);

474

475

ws = dest->s + dest->length;

476

mp = (const unsigned char *)s;

477

count = 0;

478

while (count < (int)length && *mp) {

479

*ws++ = (wchar_t)*mp++;

480

count++;

481

}

482

} else if (sc != NULL && (sc->flag & SCONV_NORMALIZATION_C)) {

483

484

* Normalize UTF-8 and UTF-16BE and convert it directly

485

* to UTF-16 as wchar_t.

486

487

struct archive_string u16;

488

int saved_flag = sc->flag;/* save current flag. */

489

490

if (is_big_endian())

491

sc->flag |= SCONV_TO_UTF16BE;

492

else

493

sc->flag |= SCONV_TO_UTF16LE;

494

495

if (sc->flag & SCONV_FROM_UTF16) {

496

497

* UTF-16BE/LE NFD ===> UTF-16 NFC

498

499

count = utf16nbytes(s, length);

221

500

} else {

222

/* Unicode has no codes larger than 0x1fffff. */

223

/* TODO: use \uXXXX escape here instead of ? */

224

*p++ = '?';

225

return_val = NULL;

226

}

501

502

* UTF-8 NFD ===> UTF-16 NFC

503

504

count = mbsnbytes(s, length);

505

}

506

u16.s = (char *)dest->s;

507

u16.length = dest->length << 1;;

508

u16.buffer_length = dest->buffer_length;

509

ret = archive_string_normalize_C(&u16, s, count, sc);

510

dest->s = (wchar_t *)u16.s;

511

dest->length = u16.length >> 1;

512

dest->buffer_length = u16.buffer_length;

513

sc->flag = saved_flag;/* restore the saved flag. */

514

return (ret);

515

} else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) {

516

count = utf16nbytes(s, length);

517

count >>= 1; /* to be WCS length */

518

/* Allocate memory for WCS. */

519

if (NULL == archive_wstring_ensure(dest,

520

dest->length + count + 1))

521

return (-1);

522

wmemcpy(dest->s + dest->length, (wchar_t *)s, count);

523

if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) {

524

uint16_t *u16 = (uint16_t *)(dest->s + dest->length);

525

int b;

526

for (b = 0; b < count; b++) {

527

uint16_t val = archive_le16dec(u16+b);

528

archive_be16enc(u16+b, val);

529

}

530

} else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) {

531

uint16_t *u16 = (uint16_t *)(dest->s + dest->length);

532

int b;

533

for (b = 0; b < count; b++) {

534

uint16_t val = archive_be16dec(u16+b);

535

archive_le16enc(u16+b, val);

536

}

537

}

538

} else {

539

DWORD mbflag;

540

541

if (sc == NULL)

542

mbflag = 0;

543

else if (sc->flag & SCONV_FROM_CHARSET) {

544

/* Do not trust the length which comes from

545

* an archive file. */

546

length = mbsnbytes(s, length);

547

mbflag = 0;

548

} else

549

mbflag = MB_PRECOMPOSED;

550

551

if (length == 0) {

552

553

* We do not need to convert any characters but make

554

* sure `dest' has a valid buffer (no NULL pointer).

555

556

if (NULL == archive_wstring_ensure(dest,

557

dest->length + 1))

558

return (-1);

559

dest->s[dest->length] = L'\0';

560

return (0);

561

}

562

563

564

* Count how many wide characters are needed for WCS.

565

566

count = MultiByteToWideChar(from_cp,

567

mbflag, s, length, NULL, 0);

568

if (count == 0) {

569

if (dest->s == NULL) {

570

if (NULL == archive_wstring_ensure(dest,

571

dest->length + 1))

572

return (-1);

573

}

574

dest->s[dest->length] = L'\0';

575

return (-1);

576

}

577

/* Allocate memory for WCS. */

578

if (NULL == archive_wstring_ensure(dest,

579

dest->length + count + 1))

580

return (-1);

581

/* Convert MBS to WCS. */

582

count = MultiByteToWideChar(from_cp,

583

mbflag, s, length, dest->s + dest->length, count);

584

if (count == 0)

585

ret = -1;

227

586

}

228

*p = '\0';

229

archive_strcat(as, buff);

230

return (return_val);

587

dest->length += count;

588

dest->s[dest->length] = L'\0';

589

return (ret);

231

590

}

232

591

233

static int

234

utf8_to_unicode(int *pwc, const char *s, size_t n)

592

#elif defined(HAVE_MBSNRTOWCS)

593

594

595

* Convert MBS to WCS.

596

* Note: returns -1 if conversion fails.

597

598

int

599

archive_wstring_append_from_mbs(struct archive_wstring *dest,

600

const char *p, size_t len)

235

601

{

236

int ch;

237

238

239

* Decode 1-4 bytes depending on the value of the first byte.

602

size_t r;

603

604

* No single byte will be more than one wide character,

605

* so this length estimate will always be big enough.

240

606

241

ch = (unsigned char)*s;

242

if (ch == 0) {

243

return (0); /* Standard: return 0 for end-of-string. */

607

size_t wcs_length = len;

608

size_t mbs_length = len;

609

const char *mbs = p;

610

wchar_t *wcs;

611

mbstate_t shift_state;

612

613

memset(&shift_state, 0, sizeof(shift_state));

614

if (NULL == archive_wstring_ensure(dest, dest->length + wcs_length + 1))

615

__archive_errx(1,

616

"No memory for archive_wstring_append_from_mbs()");

617

wcs = dest->s + dest->length;

618

r = mbsnrtowcs(wcs, &mbs, mbs_length, wcs_length, &shift_state);

619

if (r != (size_t)-1) {

620

dest->length += r;

621

dest->s[dest->length] = L'\0';

622

return (0);

244

623

}

245

if ((ch & 0x80) == 0) {

246

*pwc = ch & 0x7f;

247

return (1);

248

}

249

if ((ch & 0xe0) == 0xc0) {

250

if (n < 2)

251

return (-1);

252

if ((s[1] & 0xc0) != 0x80) return (-1);

253

*pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f);

254

return (2);

255

}

256

if ((ch & 0xf0) == 0xe0) {

257

if (n < 3)

258

return (-1);

259

if ((s[1] & 0xc0) != 0x80) return (-1);

260

if ((s[2] & 0xc0) != 0x80) return (-1);

261

*pwc = ((ch & 0x0f) << 12)

262

| ((s[1] & 0x3f) << 6)

263

| (s[2] & 0x3f);

264

return (3);

265

}

266

if ((ch & 0xf8) == 0xf0) {

267

if (n < 4)

268

return (-1);

269

if ((s[1] & 0xc0) != 0x80) return (-1);

270

if ((s[2] & 0xc0) != 0x80) return (-1);

271

if ((s[3] & 0xc0) != 0x80) return (-1);

272

*pwc = ((ch & 0x07) << 18)

273

| ((s[1] & 0x3f) << 12)

274

| ((s[2] & 0x3f) << 6)

275

| (s[3] & 0x3f);

276

return (4);

277

}

278

/* Invalid first byte. */

624

dest->s[dest->length] = L'\0';

279

625

return (-1);

280

626

}

281

627

628

#else

629

282

630

283

* Return a wide-character Unicode string by converting this archive_string

284

* from UTF-8. We assume that systems with 16-bit wchar_t always use

285

* UTF16 and systems with 32-bit wchar_t can accept UCS4.

631

* Convert MBS to WCS.

632

* Note: returns -1 if conversion fails.

286

633

287

wchar_t *

288

__archive_string_utf8_w(struct archive_string *as)

634

int

635

archive_wstring_append_from_mbs(struct archive_wstring *dest,

636

const char *p, size_t len)

289

637

{

290

wchar_t *ws, *dest;

291

int wc, wc2;/* Must be large enough for a 21-bit Unicode code point. */

292

const char *src;

293

int n;

638

size_t r;

639

640

* No single byte will be more than one wide character,

641

* so this length estimate will always be big enough.

642

643

size_t wcs_length = len;

644

size_t mbs_length = len;

645

const char *mbs = p;

646

wchar_t *wcs;

647

#if HAVE_MBRTOWC

648

mbstate_t shift_state;

294

649

295

ws = (wchar_t *)malloc((as->length + 1) * sizeof(wchar_t));

296

if (ws == NULL)

297

__archive_errx(1, "Out of memory");

298

dest = ws;

299

src = as->s;

300

while (*src != '\0') {

301

n = utf8_to_unicode(&wc, src, 8);

302

if (n == 0)

650

memset(&shift_state, 0, sizeof(shift_state));

651

#endif

652

if (NULL == archive_wstring_ensure(dest, dest->length + wcs_length + 1))

653

__archive_errx(1,

654

"No memory for archive_wstring_append_from_mbs()");

655

wcs = dest->s + dest->length;

656

657

* We cannot use mbsrtowcs/mbstowcs here because those may convert

658

* extra MBS when strlen(p) > len and one wide character consists of

659

* multiple bytes.

660

661

while (wcs_length > 0 && *mbs && mbs_length > 0) {

662

#if HAVE_MBRTOWC

663

r = mbrtowc(wcs, mbs, wcs_length, &shift_state);

664

#else

665

r = mbtowc(wcs, mbs, wcs_length);

666

#endif

667

if (r == (size_t)-1 || r == (size_t)-2) {

668

dest->s[dest->length] = L'\0';

669

return (-1);

670

}

671

if (r == 0 || r > mbs_length)

303

672

break;

304

if (n < 0) {

305

free(ws);

306

return (NULL);

307

}

308

src += n;

309

if (wc >= 0xDC00 && wc <= 0xDBFF) {

310

/* This is a leading surrogate; some idiot

311

* has translated UTF16 to UTF8 without combining

312

* surrogates; rebuild the full code point before

313

* continuing. */

314

n = utf8_to_unicode(&wc2, src, 8);

315

if (n < 0) {

316

free(ws);

317

return (NULL);

318

}

319

if (n == 0) /* Ignore the leading surrogate */

320

break;

321

if (wc2 < 0xDC00 || wc2 > 0xDFFF) {

322

/* If the second character isn't a

323

* trailing surrogate, then someone

324

* has really screwed up and this is

325

* invalid. */

326

free(ws);

327

return (NULL);

328

} else {

329

src += n;

330

wc -= 0xD800;

331

wc *= 0x400;

332

wc += wc2 - 0xDC00;

333

wc += 0x10000;

334

}

335

}

336

if ((sizeof(wchar_t) < 4) && (wc > 0xffff)) {

337

/* We have a code point that won't fit into a

338

* wchar_t; convert it to a surrogate pair. */

339

wc -= 0x10000;

340

*dest++ = ((wc >> 10) & 0x3ff) + 0xD800;

341

*dest++ = (wc & 0x3ff) + 0xDC00;

342

} else

343

*dest++ = wc;

673

wcs++;

674

wcs_length--;

675

mbs += r;

676

mbs_length -= r;

344

677

}

345

*dest = L'\0';

346

return (ws);

678

dest->length = wcs - dest->s;

679

dest->s[dest->length] = L'\0';

680

return (0);

347

681

}

348

682

683

#endif

684

349

685

#if defined(_WIN32) && !defined(__CYGWIN__)

350

686

351

687

352

* Translates a wide character string into current locale character set

353

* and appends to the archive_string. Note: returns NULL if conversion

354

* fails.

688

* WCS ==> MBS.

689

* Note: returns -1 if conversion fails.

355

690

356

691

* Win32 builds use WideCharToMultiByte from the Windows API.

357

692

* (Maybe Cygwin should too? WideCharToMultiByte will know a

358

693

* lot more about local character encodings than the wcrtomb()

359

694

* wrapper is going to know.)

360

695

361

struct archive_string *

362

__archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)

363

{

364

char *p;

365

int l, wl;

366

BOOL useDefaultChar = FALSE;

367

368

wl = (int)wcslen(w);

369

l = wl * 4 + 4;

370

p = malloc(l);

371

if (p == NULL)

372

__archive_errx(1, "Out of memory");

373

/* To check a useDefaultChar is to simulate error handling of

374

* the my_wcstombs() which is running on non Windows system with

375

* wctomb().

376

* And to set NULL for last argument is necessary when a codepage

377

* is not CP_ACP(current locale).

378

379

l = WideCharToMultiByte(CP_ACP, 0, w, wl, p, l, NULL, &useDefaultChar);

380

if (l == 0) {

381

free(p);

382

return (NULL);

383

}

384

__archive_string_append(as, p, l);

385

free(p);

386

return (as);

387

}

388

389

#else

390

391

392

* Translates a wide character string into current locale character set

393

* and appends to the archive_string. Note: returns NULL if conversion

394

* fails.

395

396

* Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion

397

* one character at a time. If a non-Windows platform doesn't have

398

* either of these, fall back to the built-in UTF8 conversion.

399

400

struct archive_string *

401

__archive_strappend_w_mbs(struct archive_string *as, const wchar_t *w)

402

{

403

#if !defined(HAVE_WCTOMB) && !defined(HAVE_WCRTOMB)

404

/* If there's no built-in locale support, fall back to UTF8 always. */

405

return __archive_strappend_w_utf8(as, w);

406

#else

696

int

697

archive_string_append_from_wcs(struct archive_string *as,

698

const wchar_t *w, size_t len)

699

{

700

int r = archive_string_append_from_wcs_in_codepage(as, w, len, NULL);

701

if (r != 0 && errno == ENOMEM)

702

__archive_errx(1, "No memory");

703

return (r);

704

}

705

706

static int

707

archive_string_append_from_wcs_in_codepage(struct archive_string *as,

708

const wchar_t *ws, size_t len, struct archive_string_conv *sc)

709

{

710

BOOL defchar_used, *dp;

711

int count, ret = 0;

712

UINT to_cp;

713

int wslen = (int)len;

714

715

if (sc != NULL)

716

to_cp = sc->to_cp;

717

else

718

to_cp = get_current_codepage();

719

720

if (to_cp == CP_C_LOCALE) {

721

722

* "C" locale special process.

723

724

const wchar_t *wp = ws;

725

char *p;

726

727

if (NULL == archive_string_ensure(as,

728

as->length + wslen +1))

729

return (-1);

730

p = as->s + as->length;

731

count = 0;

732

defchar_used = 0;

733

while (count < wslen && *wp) {

734

if (*wp > 255) {

735

*p++ = '?';

736

wp++;

737

defchar_used = 1;

738

} else

739

*p++ = (char)*wp++;

740

count++;

741

}

742

} else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) {

743

uint16_t *u16;

744

745

if (NULL ==

746

archive_string_ensure(as, as->length + len * 2 + 2))

747

return (-1);

748

u16 = (uint16_t *)(as->s + as->length);

749

count = 0;

750

defchar_used = 0;

751

if (sc->flag & SCONV_TO_UTF16BE) {

752

while (count < (int)len && *ws) {

753

archive_be16enc(u16+count, *ws);

754

ws++;

755

count++;

756

}

757

} else {

758

while (count < (int)len && *ws) {

759

archive_le16enc(u16+count, *ws);

760

ws++;

761

count++;

762

}

763

}

764

count <<= 1; /* to be byte size */

765

} else {

766

/* Make sure the MBS buffer has plenty of room. */

767

if (NULL ==

768

archive_string_ensure(as, as->length + len * 2 + 1))

769

return (-1);

770

do {

771

defchar_used = 0;

772

if (to_cp == CP_UTF8 || sc == NULL)

773

dp = NULL;

774

else

775

dp = &defchar_used;

776

count = WideCharToMultiByte(to_cp, 0, ws, wslen,

777

as->s + as->length, as->buffer_length-1, NULL, dp);

778

if (count == 0 &&

779

GetLastError() == ERROR_INSUFFICIENT_BUFFER) {

780

/* Expand the MBS buffer and retry. */

781

if (NULL == archive_string_ensure(as,

782

as->buffer_length + len))

783

return (-1);

784

continue;

785

}

786

if (count == 0)

787

ret = -1;

788

} while (0);

789

}

790

as->length += count;

791

as->s[as->length] = '\0';

792

return (defchar_used?-1:ret);

793

}

794

795

#elif defined(HAVE_WCSNRTOMBS)

796

797

798

* Translates a wide character string into current locale character set

799

* and appends to the archive_string. Note: returns -1 if conversion

800

* fails.

801

802

int

803

archive_string_append_from_wcs(struct archive_string *as,

804

const wchar_t *w, size_t len)

805

{

806

mbstate_t shift_state;

807

size_t r, ndest, nwc;

808

char *dest;

809

const wchar_t *wp, *wpp;

810

int ret_val = 0;

811

812

wp = w;

813

nwc = len;

814

ndest = len * 2;

815

/* Initialize the shift state. */

816

memset(&shift_state, 0, sizeof(shift_state));

817

while (nwc > 0) {

818

/* Allocate buffer for MBS. */

819

if (archive_string_ensure(as, as->length + ndest + 1) == NULL)

820

__archive_errx(1, "Out of memory");

821

822

dest = as->s + as->length;

823

wpp = wp;

824

r = wcsnrtombs(dest, &wp, nwc,

825

as->buffer_length - as->length -1,

826

&shift_state);

827

if (r == (size_t)-1) {

828

if (errno == EILSEQ) {

829

/* Retry the conversion for only the leading wide characters that converted safely. */

830

size_t xwc = wp - wpp;

831

wp = wpp;

832

r = wcsnrtombs(dest, &wp, xwc,

833

as->buffer_length - as->length -1,

834

&shift_state);

835

if (r == (size_t)-1)

836

/* This should not happen. */

837

return (-1);

838

as->length += r;

839

nwc -= wp - wpp;

840

/* Skip an illegal wide char. */

841

as->s[as->length++] = '?';

842

wp++;

843

nwc--;

844

ret_val = -1;

845

continue;

846

} else {

847

ret_val = -1;

848

break;

849

}

850

}

851

as->length += r;

852

if (wp == NULL || (wp - wpp) >= nwc)

853

break;

854

/* Get the remaining WCS length. */

855

nwc -= wp - wpp;

856

}

857

/* All wide characters are translated to MBS. */

858

as->s[as->length] = '\0';

859

return (ret_val);

860

}

861

862

#elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB)

863

864

865

* Translates a wide character string into current locale character set

866

* and appends to the archive_string. Note: returns -1 if conversion

867

* fails.

868

869

int

870

archive_string_append_from_wcs(struct archive_string *as,

871

const wchar_t *w, size_t len)

872

{

407

873

/* We cannot use the standard wcstombs() here because it

408

874

* cannot tell us how big the output buffer should be. So

409

875

* I've built a loop around wcrtomb() or wctomb() that

410

876

* converts a character at a time and resizes the string as

411

877

* needed. We prefer wcrtomb() when it's available because

412

878

* it's thread-safe. */

413

int n;

879

int n, ret_val = 0;

414

880

char *p;

415

char buff[256];

881

char *end;

416

882

#if HAVE_WCRTOMB

417

883

mbstate_t shift_state;

418

884

421

887

/* Clear the shift state before starting. */

422

888

wctomb(NULL, L'\0');

423

889

#endif

424

425

890

426

* Convert one wide char at a time into 'buff', whenever that

427

* fills, append it to the string.

891

* Allocate buffer for MBS.

892

* We need this allocation here since it is possible that

893

* as->s is still NULL.

428

894

429

p = buff;

430

while (*w != L'\0') {

431

/* Flush the buffer when we have <=16 bytes free. */

432

/* (No encoding has a single character >16 bytes.) */

433

if ((size_t)(p - buff) >= (size_t)(sizeof(buff) - MB_CUR_MAX)) {

434

*p = '\0';

435

archive_strcat(as, buff);

436

p = buff;

895

if (archive_string_ensure(as, as->length + len + 1) == NULL)

896

__archive_errx(1, "Out of memory");

897

898

p = as->s + as->length;

899

end = as->s + as->buffer_length - MB_CUR_MAX -1;

900

while (*w != L'\0' && len > 0) {

901

if (p >= end) {

902

as->length = p - as->s;

903

as->s[as->length] = '\0';

904

/* Re-allocate buffer for MBS. */

905

if (archive_string_ensure(as,

906

as->length + len * 2 + 1) == NULL)

907

__archive_errx(1, "Out of memory");

908

p = as->s + as->length;

909

end = as->s + as->buffer_length - MB_CUR_MAX -1;

437

910

}

438

911

#if HAVE_WCRTOMB

439

912

n = wcrtomb(p, *w++, &shift_state);

440

913

#else

441

914

n = wctomb(p, *w++);

442

915

#endif

916

if (n == -1) {

917

if (errno == EILSEQ) {

918

/* Skip an illegal wide char. */

919

*p++ = '?';

920

ret_val = -1;

921

} else {

922

ret_val = -1;

923

break;

924

}

925

} else

926

p += n;

927

len--;

928

}

929

as->length = p - as->s;

930

as->s[as->length] = '\0';

931

return (ret_val);

932

}

933

934

#else /* HAVE_WCTOMB || HAVE_WCRTOMB */

935

936

937

/*
 * Variant built in the #else branch of HAVE_WCTOMB || HAVE_WCRTOMB,
 * i.e. when neither wctomb() nor wcrtomb() is available.
 *
 * No conversion is attempted here: every argument is ignored and -1
 * (conversion failure) is always returned.
 *
 * TODO: Test if __STDC_ISO_10646__ is defined.
 * NOTE(review): the historical comment said a built-in UTF-8 conversion
 * should be used as the fallback on such platforms; this stub does not
 * implement one.
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	/* Nothing is used; the casts only silence unused-parameter warnings. */
	(void)len;
	(void)w;
	(void)as;

	return (-1);
}

951

952

#endif /* HAVE_WCTOMB || HAVE_WCRTOMB */

953

954

955

* Find a string conversion object by a pair of 'from' charset name

956

* and 'to' charset name from an archive object.

957

* Return NULL if not found.

958

959

static struct archive_string_conv *

960

find_sconv_object(struct archive *a, const char *fc, const char *tc)

961

{

962

struct archive_string_conv *sc;

963

964

if (a == NULL)

965

return (NULL);

966

967

for (sc = a->sconv; sc != NULL; sc = sc->next) {

968

if (strcmp(sc->from_charset, fc) == 0 &&

969

strcmp(sc->to_charset, tc) == 0)

970

break;

971

}

972

return (sc);

973

}

974

975

976

* Register a string object to an archive object.

977

978

static void

979

add_sconv_object(struct archive *a, struct archive_string_conv *sc)

980

{

981

struct archive_string_conv **psc;

982

983

/* Add a new sconv to sconv list. */

984

psc = &(a->sconv);

985

while (*psc != NULL)

986

psc = &((*psc)->next);

987

*psc = sc;

988

}

989

990

#if defined(__APPLE__)

991

992

static int

993

createUniInfo(struct archive_string_conv *sconv)

994

{

995

UnicodeMapping map;

996

OSStatus err;

997

998

map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,

999

kUnicodeNoSubset, kUnicode16BitFormat);

1000

map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,

1001

kUnicodeHFSPlusDecompVariant, kUnicode16BitFormat);

1002

map.mappingVersion = kUnicodeUseLatestMapping;

1003

1004

sconv->uniInfo = NULL;

1005

err = CreateUnicodeToTextInfo(&map, &(sconv->uniInfo));

1006

return ((err == noErr)? 0: -1);

1007

}

1008

1009

#endif /* __APPLE__ */

1010

1011

static void

1012

add_converter(struct archive_string_conv *sc, int (*converter)

1013

(struct archive_string *, const void *, size_t,

1014

struct archive_string_conv *))

1015

{

1016

if (sc == NULL || sc->nconverter >= 2)

1017

__archive_errx(1, "Programing error");

1018

sc->converter[sc->nconverter++] = converter;

1019

}

1020

1021

static void

1022

setup_converter(struct archive_string_conv *sc)

1023

{

1024

1025

/* Reset. */

1026

sc->nconverter = 0;

1027

1028

1029

* Perform special sequence for the incorrect UTF-8 filenames

1030

* made by libarchive2.x.

1031

1032

if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) {

1033

add_converter(sc, strncat_from_utf8_libarchive2);

1034

return;

1035

}

1036

1037

1038

* Convert a string to UTF-16BE/LE.

1039

1040

if (sc->flag & SCONV_TO_UTF16) {

1041

1042

* If the current locale is UTF-8, we can translate

1043

* a UTF-8 string into a UTF-16BE string.

1044

1045

if (sc->flag & SCONV_FROM_UTF8) {

1046

add_converter(sc, archive_string_append_unicode);

1047

return;

1048

}

1049

1050

#if defined(_WIN32) && !defined(__CYGWIN__)

1051

if (sc->flag & SCONV_WIN_CP) {

1052

if (sc->flag & SCONV_TO_UTF16BE)

1053

add_converter(sc, win_strncat_to_utf16be);

1054

else

1055

add_converter(sc, win_strncat_to_utf16le);

1056

return;

1057

}

1058

#endif

1059

1060

#if defined(HAVE_ICONV)

1061

if (sc->cd != (iconv_t)-1) {

1062

add_converter(sc, iconv_strncat_in_locale);

1063

return;

1064

}

1065

#endif

1066

1067

if (sc->flag & SCONV_BEST_EFFORT) {

1068

if (sc->flag & SCONV_TO_UTF16BE)

1069

add_converter(sc, best_effort_strncat_to_utf16be);

1070

else

1071

add_converter(sc, best_effort_strncat_to_utf16le);

1072

} else

1073

/* Make sure we have no converter. */

1074

sc->nconverter = 0;

1075

return;

1076

}

1077

1078

1079

* Convert a string from UTF-16BE/LE.

1080

1081

if (sc->flag & SCONV_FROM_UTF16) {

1082

1083

* At least we should normalize a UTF-16BE string.

1084

1085

#if defined(__APPLE__)

1086

if (sc->flag & SCONV_NORMALIZATION_D)

1087

add_converter(sc,archive_string_normalize_D);

1088

else

1089

#endif

1090

if (sc->flag & SCONV_NORMALIZATION_C)

1091

add_converter(sc, archive_string_normalize_C);

1092

1093

if (sc->flag & SCONV_TO_UTF8) {

1094

1095

* If the current locale is UTF-8, we can translate

1096

* a UTF-16BE/LE string into a UTF-8 string directly.

1097

1098

if (!(sc->flag &

1099

(SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))

1100

add_converter(sc,

1101

archive_string_append_unicode);

1102

return;

1103

}

1104

1105

#if defined(_WIN32) && !defined(__CYGWIN__)

1106

if (sc->flag & SCONV_WIN_CP) {

1107

if (sc->flag & SCONV_FROM_UTF16BE)

1108

add_converter(sc, win_strncat_from_utf16be);

1109

else

1110

add_converter(sc, win_strncat_from_utf16le);

1111

return;

1112

}

1113

#endif

1114

1115

#if defined(HAVE_ICONV)

1116

if (sc->cd != (iconv_t)-1) {

1117

add_converter(sc, iconv_strncat_in_locale);

1118

return;

1119

}

1120

#endif

1121

1122

if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))

1123

== (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE))

1124

add_converter(sc, best_effort_strncat_from_utf16be);

1125

else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))

1126

== (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE))

1127

add_converter(sc, best_effort_strncat_from_utf16le);

1128

else

1129

/* Make sure we have no converter. */

1130

sc->nconverter = 0;

1131

return;

1132

}

1133

1134

if (sc->flag & SCONV_FROM_UTF8) {

1135

1136

* At least we should normalize a UTF-8 string.

1137

1138

#if defined(__APPLE__)

1139

if (sc->flag & SCONV_NORMALIZATION_D)

1140

add_converter(sc,archive_string_normalize_D);

1141

else

1142

#endif

1143

if (sc->flag & SCONV_NORMALIZATION_C)

1144

add_converter(sc, archive_string_normalize_C);

1145

1146

1147

* Copy UTF-8 string with a check of CESU-8.

1148

* Apparently, iconv does not check surrogate pairs in UTF-8

1149

* when both from-charset and to-charset are UTF-8, and then

1150

* we use our UTF-8 copy code.

1151

1152

if (sc->flag & SCONV_TO_UTF8) {

1153

1154

* If the current locale is UTF-8, we can translate

1155

* a UTF-16BE string into a UTF-8 string directly.

1156

1157

if (!(sc->flag &

1158

(SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C)))

1159

add_converter(sc, strncat_from_utf8_to_utf8);

1160

return;

1161

}

1162

}

1163

1164

#if defined(_WIN32) && !defined(__CYGWIN__)

1165

1166

* On Windows we can use Windows API for a string conversion.

1167

1168

if (sc->flag & SCONV_WIN_CP) {

1169

add_converter(sc, strncat_in_codepage);

1170

return;

1171

}

1172

#endif

1173

1174

#if HAVE_ICONV

1175

if (sc->cd != (iconv_t)-1) {

1176

add_converter(sc, iconv_strncat_in_locale);

1177

return;

1178

}

1179

#endif

1180

1181

1182

* Try conversion in the best effort or no conversion.

1183

1184

if ((sc->flag & SCONV_BEST_EFFORT) || sc->same)

1185

add_converter(sc, best_effort_strncat_in_locale);

1186

else

1187

/* Make sure we have no converter. */

1188

sc->nconverter = 0;

1189

}

1190

1191

1192

/*
 * Return the canonical spelling of a charset name.
 *
 * Only the names referenced by create_sconv_object() are recognized
 * (case-insensitively for ASCII letters): "UTF-8"/"UTF8",
 * "UTF-16BE"/"UTF16BE", "UTF-16LE"/"UTF16LE" and "CP932".  Any other
 * name, as well as NULL, an empty string or a name longer than 15
 * characters, is returned unchanged (the very same pointer).
 */
static const char *
canonical_charset_name(const char *charset)
{
	static const struct {
		const char *alias;
		const char *canonical;
	} known[] = {
		{ "UTF-8",	"UTF-8" },
		{ "UTF8",	"UTF-8" },
		{ "UTF-16BE",	"UTF-16BE" },
		{ "UTF16BE",	"UTF-16BE" },
		{ "UTF-16LE",	"UTF-16LE" },
		{ "UTF16LE",	"UTF-16LE" },
		{ "CP932",	"CP932" },
	};
	char upper[16];
	size_t i, n;

	if (charset == NULL || charset[0] == '\0')
		return (charset);
	n = strlen(charset);
	if (n > 15)
		return (charset);	/* Would not fit in upper[]. */

	/* Copy name to uppercase. */
	for (i = 0; i < n; i++) {
		char c = charset[i];

		if (c >= 'a' && c <= 'z')
			c -= 'a' - 'A';
		upper[i] = c;
	}
	upper[n] = '\0';

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(upper, known[i].alias) == 0)
			return (known[i].canonical);
	}
	return (charset);
}

1230

1231

1232

* Create a string conversion object.

1233

1234

static struct archive_string_conv *

1235

create_sconv_object(const char *fc, const char *tc,

1236

unsigned current_codepage, int flag)

1237

{

1238

struct archive_string_conv *sc;

1239

1240

sc = calloc(1, sizeof(*sc));

1241

if (sc == NULL)

1242

return (NULL);

1243

sc->next = NULL;

1244

sc->from_charset = strdup(fc);

1245

if (sc->from_charset == NULL) {

1246

free(sc);

1247

return (NULL);

1248

}

1249

sc->to_charset = strdup(tc);

1250

if (sc->to_charset == NULL) {

1251

free(sc);

1252

free(sc->from_charset);

1253

return (NULL);

1254

}

1255

archive_string_init(&sc->utftmp);

1256

#if defined(__APPLE__)

1257

archive_string_init(&sc->utf16nfc);

1258

archive_string_init(&sc->utf16nfd);

1259

#endif

1260

1261

if (flag & SCONV_TO_CHARSET) {

1262

1263

* Convert characters from the current locale charset to

1264

* a specified charset.

1265

1266

sc->from_cp = current_codepage;

1267

sc->to_cp = make_codepage_from_charset(tc);

1268

#if defined(_WIN32) && !defined(__CYGWIN__)

1269

if (IsValidCodePage(sc->to_cp))

1270

flag |= SCONV_WIN_CP;

1271

#endif

1272

} else if (flag & SCONV_FROM_CHARSET) {

1273

1274

* Convert characters from a specified charset to

1275

* the current locale charset.

1276

1277

sc->to_cp = current_codepage;

1278

sc->from_cp = make_codepage_from_charset(fc);

1279

#if defined(_WIN32) && !defined(__CYGWIN__)

1280

if (IsValidCodePage(sc->from_cp))

1281

flag |= SCONV_WIN_CP;

1282

#endif

1283

}

1284

1285

1286

* Check if "from charset" and "to charset" are the same.

1287

1288

if (strcmp(fc, tc) == 0 ||

1289

(sc->from_cp != -1 && sc->from_cp == sc->to_cp))

1290

sc->same = 1;

1291

else

1292

sc->same = 0;

1293

1294

1295

* Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE.

1296

1297

if (strcmp(tc, "UTF-8") == 0)

1298

flag |= SCONV_TO_UTF8;

1299

else if (strcmp(tc, "UTF-16BE") == 0)

1300

flag |= SCONV_TO_UTF16BE;

1301

else if (strcmp(tc, "UTF-16LE") == 0)

1302

flag |= SCONV_TO_UTF16LE;

1303

if (strcmp(fc, "UTF-8") == 0)

1304

flag |= SCONV_FROM_UTF8;

1305

else if (strcmp(fc, "UTF-16BE") == 0)

1306

flag |= SCONV_FROM_UTF16BE;

1307

else if (strcmp(fc, "UTF-16LE") == 0)

1308

flag |= SCONV_FROM_UTF16LE;

1309

#if defined(_WIN32) && !defined(__CYGWIN__)

1310

if (sc->to_cp == CP_UTF8)

1311

flag |= SCONV_TO_UTF8;

1312

else if (sc->to_cp == CP_UTF16BE)

1313

flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP;

1314

else if (sc->to_cp == CP_UTF16LE)

1315

flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP;

1316

if (sc->from_cp == CP_UTF8)

1317

flag |= SCONV_FROM_UTF8;

1318

else if (sc->from_cp == CP_UTF16BE)

1319

flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP;

1320

else if (sc->from_cp == CP_UTF16LE)

1321

flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP;

1322

#endif

1323

1324

1325

* Set a flag for Unicode NFD. Usually iconv cannot correctly

1326

* handle it. So we have to translate NFD characters to NFC ones

1327

* ourselves before iconv handles. Another reason is to prevent

1328

* that the same sight of two filenames, one is NFC and other

1329

* is NFD, would be in its directory.

1330

* On Mac OS X, although its filesystem layer automatically

1331

* convert filenames to NFD, it would be useful for filename

1332

* comparing to find out the same filenames that we normalize

1333

* that to be NFD ourselves.

1334

1335

if ((flag & SCONV_FROM_CHARSET) &&

1336

(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) {

1337

#if defined(__APPLE__)

1338

if (flag & SCONV_TO_UTF8) {

1339

if (createUniInfo(sc) == 0)

1340

flag |= SCONV_NORMALIZATION_D;

1341

} else

1342

#endif

1343

flag |= SCONV_NORMALIZATION_C;

1344

}

1345

1346

#if defined(HAVE_ICONV)

1347

sc->cd_w = (iconv_t)-1;

1348

1349

* Create an iconv object.

1350

1351

if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) &&

1352

(flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) ||

1353

(flag & SCONV_WIN_CP)) {

1354

/* This case we won't use iconv. */

1355

sc->cd = (iconv_t)-1;

1356

#if defined(__APPLE__)

1357

} else if ((flag & SCONV_FROM_CHARSET) && (flag & SCONV_TO_UTF8)) {

1358

1359

* In case reading an archive file.

1360

* Translate non-Unicode filenames in an archive file to

1361

* UTF-8-MAC filenames.

1362

1363

sc->cd = iconv_open("UTF-8-MAC", fc);

1364

if (sc->cd == (iconv_t)-1) {

1365

if ((sc->flag & SCONV_BEST_EFFORT) &&

1366

strcmp(fc, "CP932") == 0) {

1367

sc->cd = iconv_open("UTF-8-MAC", "SJIS");

1368

if (sc->cd == (iconv_t)-1) {

1369

sc->cd = iconv_open(tc, fc);

1370

if (sc->cd == (iconv_t)-1)

1371

sc->cd = iconv_open(tc, "SJIS");

1372

}

1373

} else

1374

sc->cd = iconv_open(tc, fc);

1375

}

1376

} else if ((flag & SCONV_TO_CHARSET) && (flag & SCONV_FROM_UTF8)) {

1377

1378

* In case writing an archive file.

1379

* Translate UTF-8-MAC filenames in HFS Plus to non-Unicode

1380

* filenames.

1381

1382

sc->cd = iconv_open(tc, "UTF-8-MAC");

1383

if (sc->cd == (iconv_t)-1) {

1384

if ((sc->flag & SCONV_BEST_EFFORT) &&

1385

strcmp(tc, "CP932") == 0) {

1386

sc->cd = iconv_open("SJIS", "UTF-8-MAC");

1387

if (sc->cd == (iconv_t)-1) {

1388

sc->cd = iconv_open(tc, fc);

1389

if (sc->cd == (iconv_t)-1)

1390

sc->cd = iconv_open("SJIS", fc);

1391

}

1392

} else

1393

sc->cd = iconv_open(tc, fc);

1394

}

1395

#endif

1396

} else {

1397

sc->cd = iconv_open(tc, fc);

1398

if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) {

1399

1400

* Unfortunaly, all of iconv implements do support

1401

* "CP932" character-set, so we should use "SJIS"

1402

* instead if iconv_open failed.

1403

1404

if (strcmp(tc, "CP932") == 0)

1405

sc->cd = iconv_open("SJIS", fc);

1406

else if (strcmp(fc, "CP932") == 0)

1407

sc->cd = iconv_open(tc, "SJIS");

1408

}

1409

#if defined(_WIN32) && !defined(__CYGWIN__)

1410

1411

* archive_mstring on Windows directly convert multi-bytes

1412

* into archive_wstring in order not to depend on locale

1413

* so that you can do a I18N programing. This will be

1414

* used only in archive_mstring_copy_mbs_len_l so far.

1415

1416

if (flag & SCONV_FROM_CHARSET) {

1417

sc->cd_w = iconv_open("UTF-8", fc);

1418

if (sc->cd_w == (iconv_t)-1 &&

1419

(sc->flag & SCONV_BEST_EFFORT)) {

1420

if (strcmp(fc, "CP932") == 0)

1421

sc->cd_w = iconv_open("UTF-8", "SJIS");

1422

}

1423

}

1424

#endif /* _WIN32 && !__CYGWIN__ */

1425

}

1426

#endif /* HAVE_ICONV */

1427

1428

sc->flag = flag;

1429

1430

1431

* Setup converters.

1432

1433

setup_converter(sc);

1434

1435

return (sc);

1436

}

1437

1438

1439

* Free a string conversion object.

1440

1441

static void

1442

free_sconv_object(struct archive_string_conv *sc)

1443

{

1444

free(sc->from_charset);

1445

free(sc->to_charset);

1446

archive_string_free(&sc->utftmp);

1447

#if HAVE_ICONV

1448

if (sc->cd != (iconv_t)-1)

1449

iconv_close(sc->cd);

1450

if (sc->cd_w != (iconv_t)-1)

1451

iconv_close(sc->cd_w);

1452

#endif

1453

#if defined(__APPLE__)

1454

archive_string_free(&sc->utf16nfc);

1455

archive_string_free(&sc->utf16nfd);

1456

if (sc->uniInfo != NULL)

1457

DisposeUnicodeToTextInfo(&(sc->uniInfo));

1458

#endif

1459

free(sc);

1460

}

1461

1462

#if defined(_WIN32) && !defined(__CYGWIN__)

1463

/*
 * Parse a string that must consist solely of decimal digits.
 *
 * Returns the parsed value, 0 for an empty string, or (unsigned)-1 as
 * soon as a non-digit character is met.  Overflow is not detected; the
 * value simply wraps around.
 */
static unsigned
my_atoi(const char *p)
{
	unsigned value = 0;
	const char *s;

	for (s = p; *s != '\0'; s++) {
		if (*s < '0' || *s > '9')
			return ((unsigned)-1);
		value = value * 10 + (*s - '0');
	}
	return (value);
}

1478

1479

1480

* Translate Charset name (as used by iconv) into CodePage (as used by Windows)

1481

* Return -1 if failed.

1482

1483

* Note: This translation code may be insufficient.

1484

1485

/*
 * Charset name (upper case, as used by iconv) to Windows code page.
 *
 * make_codepage_from_charset() looks names up with a strcmp()-based
 * binary search, so the entries MUST stay sorted in strcmp() order.
 * Note that this is plain byte order, e.g. "CP1025" sorts before
 * "CP367" because '1' < '3'.
 */
static struct charset {
	const char *name;
	unsigned cp;
} charsets[] = {
	/* MUST BE SORTED! */
	{"ASCII", 1252},
	{"ASMO-708", 708},
	{"BIG5", 950},
	{"CHINESE", 936},
	{"CP1025", 21025},
	{"CP367", 1252},
	{"CP819", 1252},
	{"DOS-720", 720},
	{"DOS-862", 862},
	{"EUC-CN", 51936},
	{"EUC-JP", 51932},
	{"EUC-KR", 949},
	{"EUCCN", 51936},
	{"EUCJP", 51932},
	{"EUCKR", 949},
	{"GB18030", 54936},
	{"GB2312", 936},
	{"HEBREW", 1255},
	{"HZ-GB-2312", 52936},
	{"IBM273", 20273},
	{"IBM277", 20277},
	{"IBM278", 20278},
	{"IBM280", 20280},
	{"IBM284", 20284},
	{"IBM285", 20285},
	{"IBM290", 20290},
	{"IBM297", 20297},
	{"IBM367", 1252},
	{"IBM420", 20420},
	{"IBM423", 20423},
	{"IBM424", 20424},
	{"IBM819", 1252},
	{"IBM871", 20871},
	{"IBM880", 20880},
	{"IBM905", 20905},
	{"IBM924", 20924},
	{"ISO-8859-1", 28591},
	{"ISO-8859-13", 28603},
	{"ISO-8859-15", 28605},
	{"ISO-8859-2", 28592},
	{"ISO-8859-3", 28593},
	{"ISO-8859-4", 28594},
	{"ISO-8859-5", 28595},
	{"ISO-8859-6", 28596},
	{"ISO-8859-7", 28597},
	{"ISO-8859-8", 28598},
	{"ISO-8859-9", 28599},
	{"ISO8859-1", 28591},
	{"ISO8859-13", 28603},
	{"ISO8859-15", 28605},
	{"ISO8859-2", 28592},
	{"ISO8859-3", 28593},
	{"ISO8859-4", 28594},
	{"ISO8859-5", 28595},
	{"ISO8859-6", 28596},
	{"ISO8859-7", 28597},
	{"ISO8859-8", 28598},
	{"ISO8859-9", 28599},
	{"JOHAB", 1361},
	{"KOI8-R", 20866},
	{"KOI8-U", 21866},
	{"KS_C_5601-1987", 949},
	{"LATIN1", 1252},
	{"LATIN2", 28592},
	{"MACINTOSH", 10000},
	{"SHIFT-JIS", 932},
	{"SHIFT_JIS", 932},
	{"SJIS", 932},
	{"US", 1252},
	{"US-ASCII", 1252},
	{"UTF-16", 1200},
	{"UTF-16BE", 1201},
	{"UTF-16LE", 1200},
	{"UTF-8", CP_UTF8},
	{"X-EUROPA", 29001},
	{"X-MAC-ARABIC", 10004},
	{"X-MAC-CE", 10029},
	{"X-MAC-CHINESEIMP", 10008},
	{"X-MAC-CHINESETRAD", 10002},
	{"X-MAC-CROATIAN", 10082},
	{"X-MAC-CYRILLIC", 10007},
	{"X-MAC-GREEK", 10006},
	{"X-MAC-HEBREW", 10005},
	{"X-MAC-ICELANDIC", 10079},
	{"X-MAC-JAPANESE", 10001},
	{"X-MAC-KOREAN", 10003},
	{"X-MAC-ROMANIAN", 10010},
	{"X-MAC-THAI", 10021},
	{"X-MAC-TURKISH", 10081},
	{"X-MAC-UKRAINIAN", 10017},
};

1581

static unsigned

1582

make_codepage_from_charset(const char *charset)

1583

{

1584

char cs[16];

1585

char *p;

1586

unsigned cp;

1587

int a, b;

1588

1589

if (charset == NULL || strlen(charset) > 15)

1590

return -1;

1591

1592

/* Copy name to uppercase. */

1593

p = cs;

1594

while (*charset) {

1595

char c = *charset++;

1596

if (c >= 'a' && c <= 'z')

1597

c -= 'a' - 'A';

1598

*p++ = c;

1599

}

1600

*p++ = '\0';

1601

cp = -1;

1602

1603

/* Look it up in the table first, so that we can easily

1604

* override CP367, which we map to 1252 instead of 367. */

1605

a = 0;

1606

b = sizeof(charsets)/sizeof(charsets[0]);

1607

while (b > a) {

1608

int c = (b + a) / 2;

1609

int r = strcmp(charsets[c].name, cs);

1610

if (r < 0)

1611

a = c + 1;

1612

else if (r > 0)

1613

b = c;

1614

else

1615

return charsets[c].cp;

1616

}

1617

1618

/* If it's not in the table, try to parse it. */

1619

switch (*cs) {

1620

case 'C':

1621

if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') {

1622

cp = my_atoi(cs + 2);

1623

} else if (strcmp(cs, "CP_ACP") == 0)

1624

cp = get_current_codepage();

1625

else if (strcmp(cs, "CP_OEMCP") == 0)

1626

cp = get_current_oemcp();

1627

break;

1628

case 'I':

1629

if (cs[1] == 'B' && cs[2] == 'M' &&

1630

cs[3] >= '0' && cs[3] <= '9') {

1631

cp = my_atoi(cs + 3);

1632

}

1633

break;

1634

case 'W':

1635

if (strncmp(cs, "WINDOWS-", 8) == 0) {

1636

cp = my_atoi(cs + 8);

1637

if (cp != 874 && (cp < 1250 || cp > 1258))

1638

cp = -1;/* This may invalid code. */

1639

}

1640

break;

1641

}

1642

return (cp);

1643

}

1644

1645

1646

* Return ANSI Code Page of current locale set by setlocale().

1647

1648

static unsigned

1649

get_current_codepage()

1650

{

1651

char *locale, *p;

1652

unsigned cp;

1653

1654

locale = setlocale(LC_CTYPE, NULL);

1655

if (locale == NULL)

1656

return (GetACP());

1657

if (locale[0] == 'C' && locale[1] == '\0')

1658

return (CP_C_LOCALE);

1659

p = strrchr(locale, '.');

1660

if (p == NULL)

1661

return (GetACP());

1662

cp = my_atoi(p+1);

1663

if (cp <= 0)

1664

return (GetACP());

1665

return (cp);

1666

}

1667

1668

1669

/*
 * Translation table between Locale Name and ACP/OEMCP.
 *
 * get_current_oemcp() scans this table from the top and uses the first
 * row whose locale name matches, so each locale name needs to appear
 * only once.  The table is terminated by a row whose acp is 0.
 */
static struct {
	unsigned acp;
	unsigned ocp;
	const char *locale;
} acp_ocp_map[] = {
	{  950,  950, "Chinese_Taiwan" },
	{  936,  936, "Chinese_People's Republic of China" },
	{ 1250,  852, "Czech_Czech Republic" },
	{ 1252,  850, "Danish_Denmark" },
	{ 1252,  850, "Dutch_Netherlands" },
	{ 1252,  850, "Dutch_Belgium" },
	{ 1252,  437, "English_United States" },
	{ 1252,  850, "English_Australia" },
	{ 1252,  850, "English_Canada" },
	{ 1252,  850, "English_New Zealand" },
	{ 1252,  850, "English_United Kingdom" },
	{ 1252,  850, "Finnish_Finland" },
	{ 1252,  850, "French_France" },
	{ 1252,  850, "French_Belgium" },
	{ 1252,  850, "French_Canada" },
	{ 1252,  850, "French_Switzerland" },
	{ 1252,  850, "German_Germany" },
	{ 1252,  850, "German_Austria" },
	{ 1252,  850, "German_Switzerland" },
	{ 1253,  737, "Greek_Greece" },
	{ 1250,  852, "Hungarian_Hungary" },
	{ 1252,  850, "Icelandic_Iceland" },
	{ 1252,  850, "Italian_Italy" },
	{ 1252,  850, "Italian_Switzerland" },
	{  932,  932, "Japanese_Japan" },
	{  949,  949, "Korean_Korea" },
	/* NOTE(review): "BokmOl" is kept exactly as found; confirm spelling. */
	{ 1252,  850, "Norwegian (BokmOl)_Norway" },
	{ 1252,  850, "Norwegian-Nynorsk_Norway" },
	{ 1250,  852, "Polish_Poland" },
	{ 1252,  850, "Portuguese_Portugal" },
	{ 1252,  850, "Portuguese_Brazil" },
	{ 1251,  866, "Russian_Russia" },
	{ 1250,  852, "Slovak_Slovakia" },
	{ 1252,  850, "Spanish_Spain" },
	{ 1252,  850, "Spanish_Mexico" },
	{ 1252,  850, "Swedish_Sweden" },
	{ 1254,  857, "Turkish_Turkey" },
	{ 0, 0, NULL}
};

1719

1720

1721

* Return OEM Code Page of current locale set by setlocale().

1722

1723

static unsigned

1724

get_current_oemcp()

1725

{

1726

int i;

1727

char *locale, *p;

1728

size_t len;

1729

1730

locale = setlocale(LC_CTYPE, NULL);

1731

if (locale == NULL)

1732

return (GetOEMCP());

1733

if (locale[0] == 'C' && locale[1] == '\0')

1734

return (CP_C_LOCALE);

1735

1736

p = strrchr(locale, '.');

1737

if (p == NULL)

1738

return (GetOEMCP());

1739

len = p - locale;

1740

for (i = 0; acp_ocp_map[i].acp; i++) {

1741

if (strncmp(acp_ocp_map[i].locale, locale, len) == 0)

1742

return (acp_ocp_map[i].ocp);

1743

}

1744

return (GetOEMCP());

1745

}

1746

#else

1747

1748

1749

* POSIX platform does not use CodePage.

1750

1751

1752

/*
 * Non-Windows variant: there is no code page to report, so the
 * "unknown" value (unsigned)-1 is always returned.
 */
static unsigned
get_current_codepage()
{
	const unsigned unknown_cp = (unsigned)-1;

	return (unknown_cp);
}

1757

/*
 * Non-Windows variant: charset names are never mapped to a code page;
 * the argument is ignored and (unsigned)-1 ("unknown") is returned.
 */
static unsigned
make_codepage_from_charset(const char *charset)
{
	const unsigned unknown_cp = (unsigned)-1;

	(void)charset; /* UNUSED */
	return (unknown_cp);
}

1763

/*
 * Non-Windows variant: there is no OEM code page to report, so the
 * "unknown" value (unsigned)-1 is always returned.
 */
static unsigned
get_current_oemcp()
{
	const unsigned unknown_cp = (unsigned)-1;

	return (unknown_cp);
}

1768

1769

#endif /* defined(_WIN32) && !defined(__CYGWIN__) */

1770

1771

1772

* Return a string conversion object.

1773

1774

static struct archive_string_conv *

1775

get_sconv_object(struct archive *a, const char *fc, const char *tc, int flag)

1776

{

1777

struct archive_string_conv *sc;

1778

unsigned current_codepage;

1779

1780

/* Check if we have made the sconv object. */

1781

sc = find_sconv_object(a, fc, tc);

1782

if (sc != NULL)

1783

return (sc);

1784

1785

if (a == NULL)

1786

current_codepage = get_current_codepage();

1787

else

1788

current_codepage = a->current_codepage;

1789

1790

sc = create_sconv_object(canonical_charset_name(fc),

1791

canonical_charset_name(tc), current_codepage, flag);

1792

if (sc == NULL) {

1793

if (a != NULL)

1794

archive_set_error(a, ENOMEM,

1795

"Could not allocate memory for "

1796

"a string conversion object");

1797

return (NULL);

1798

}

1799

1800

1801

* If there is no converter for current string conversion object,

1802

* we cannot handle this conversion.

1803

1804

if (sc->nconverter == 0) {

1805

if (a != NULL) {

1806

#if HAVE_ICONV

1807

archive_set_error(a, ARCHIVE_ERRNO_MISC,

1808

"iconv_open failed : Cannot handle ``%s''",

1809

(flag & SCONV_TO_CHARSET)?tc:fc);

1810

#else

1811

archive_set_error(a, ARCHIVE_ERRNO_MISC,

1812

"A character-set conversion not fully supported "

1813

"on this platform");

1814

#endif

1815

}

1816

/* Failed; free a sconv object. */

1817

free_sconv_object(sc);

1818

return (NULL);

1819

}

1820

1821

1822

* Success!

1823

1824

if (a != NULL)

1825

add_sconv_object(a, sc);

1826

return (sc);

1827

}

1828

1829

static const char *

1830

get_current_charset(struct archive *a)

1831

{

1832

const char *cur_charset;

1833

1834

if (a == NULL)

1835

cur_charset = default_iconv_charset("");

1836

else {

1837

cur_charset = default_iconv_charset(a->current_code);

1838

if (a->current_code == NULL) {

1839

a->current_code = strdup(cur_charset);

1840

a->current_codepage = get_current_codepage();

1841

a->current_oemcp = get_current_oemcp();

1842

}

1843

}

1844

return (cur_charset);

1845

}

1846

1847

1848

* Make and Return a string conversion object.

1849

* Return NULL if the platform does not support the specified conversion

1850

* and best_effort is 0.

1851

* If best_effort is set, A string conversion object must be returned

1852

* unless memory allocation for the object fails, but the conversion

1853

* might fail when non-ASCII code is found.

1854

1855

struct archive_string_conv *

1856

archive_string_conversion_to_charset(struct archive *a, const char *charset,

1857

int best_effort)

1858

{

1859

int flag = SCONV_TO_CHARSET;

1860

1861

if (best_effort)

1862

flag |= SCONV_BEST_EFFORT;

1863

return (get_sconv_object(a, get_current_charset(a), charset, flag));

1864

}

1865

1866

struct archive_string_conv *

1867

archive_string_conversion_from_charset(struct archive *a, const char *charset,

1868

int best_effort)

1869

{

1870

int flag = SCONV_FROM_CHARSET;

1871

1872

if (best_effort)

1873

flag |= SCONV_BEST_EFFORT;

1874

return (get_sconv_object(a, charset, get_current_charset(a), flag));

1875

}

1876

1877

1878

* archive_string_default_conversion_*_archive() are provided for Windows

1879

* platform because other archiver application use CP_OEMCP for

1880

* MultiByteToWideChar() and WideCharToMultiByte() for the filenames

1881

* in tar or zip files. But mbstowcs/wcstombs(CRT) usually use CP_ACP

1882

* unless you use setlocale(LC_ALL, ".OCP")(specify CP_OEMCP).

1883

* So we should make a string conversion between CP_ACP and CP_OEMCP

1884

* for compatibility.

1885

1886

#if defined(_WIN32) && !defined(__CYGWIN__)

1887

struct archive_string_conv *

1888

archive_string_default_conversion_for_read(struct archive *a)

1889

{

1890

const char *cur_charset = get_current_charset(a);

1891

char oemcp[16];

1892

1893

/* NOTE: a check of cur_charset is unneeded but we need

1894

* that get_current_charset() has been surely called at

1895

* this time whatever C compiler optimized. */

1896

if (cur_charset != NULL &&

1897

(a->current_codepage == CP_C_LOCALE ||

1898

a->current_codepage == a->current_oemcp))

1899

return (NULL);/* no conversion. */

1900

1901

_snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);

1902

/* Make sure a null termination must be set. */

1903

oemcp[sizeof(oemcp)-1] = '\0';

1904

return (get_sconv_object(a, oemcp, cur_charset,

1905

SCONV_FROM_CHARSET));

1906

}

1907

1908

struct archive_string_conv *

1909

archive_string_default_conversion_for_write(struct archive *a)

1910

{

1911

const char *cur_charset = get_current_charset(a);

1912

char oemcp[16];

1913

1914

/* NOTE: a check of cur_charset is unneeded but we need

1915

* that get_current_charset() has been surely called at

1916

* this time whatever C compiler optimized. */

1917

if (cur_charset != NULL &&

1918

(a->current_codepage == CP_C_LOCALE ||

1919

a->current_codepage == a->current_oemcp))

1920

return (NULL);/* no conversion. */

1921

1922

_snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp);

1923

/* Make sure a null termination must be set. */

1924

oemcp[sizeof(oemcp)-1] = '\0';

1925

return (get_sconv_object(a, cur_charset, oemcp,

1926

SCONV_TO_CHARSET));

1927

}

1928

#else

1929

/*
 * Non-Windows: there is no default conversion for reading; the archive
 * object is ignored and NULL (no conversion) is always returned.
 */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
	struct archive_string_conv *none = NULL;

	(void)a; /* UNUSED */
	return (none);
}

1935

1936

/*
 * Non-Windows: there is no default conversion for writing; the archive
 * object is ignored and NULL (no conversion) is always returned.
 */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
	struct archive_string_conv *none = NULL;

	(void)a; /* UNUSED */
	return (none);
}

1942

#endif

1943

1944

1945

* Dispose of all character conversion objects in the archive object.

1946

1947

void

1948

archive_string_conversion_free(struct archive *a)

1949

{

1950

struct archive_string_conv *sc;

1951

struct archive_string_conv *sc_next;

1952

1953

for (sc = a->sconv; sc != NULL; sc = sc_next) {

1954

sc_next = sc->next;

1955

free_sconv_object(sc);

1956

}

1957

a->sconv = NULL;

1958

free(a->current_code);

1959

a->current_code = NULL;

1960

}

1961

1962

1963

* Return a conversion charset name.

1964

1965

const char *

1966

archive_string_conversion_charset_name(struct archive_string_conv *sc)

1967

{

1968

if (sc->flag & SCONV_TO_CHARSET)

1969

return (sc->to_charset);

1970

else

1971

return (sc->from_charset);

1972

}

1973

1974

1975

* Change the behavior of a string conversion.

1976

1977

void

1978

archive_string_conversion_set_opt(struct archive_string_conv *sc, int opt)

1979

{

1980

switch (opt) {

1981

1982

* A filename in UTF-8 was made with libarchive 2.x in a wrong

1983

* assumption that wchar_t was Unicode.

1984

* This option enables simulating the assumption in order to read

1985

* that filname correctly.

1986

1987

case SCONV_SET_OPT_UTF8_LIBARCHIVE2X:

1988

#if (defined(_WIN32) && !defined(__CYGWIN__)) \

1989

|| defined(__STDC_ISO_10646__) || defined(__APPLE__)

1990

1991

* Nothing to do for it since wchar_t on these platforms

1992

* is really Unicode.

1993

1994

(void)sc; /* UNUSED */

1995

#else

1996

if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) {

1997

sc->flag |= SCONV_UTF8_LIBARCHIVE_2;

1998

/* Re-setup string converters. */

1999

setup_converter(sc);

2000

}

2001

#endif

2002

break;

2003

default:

2004

break;

2005

}

2006

}

2007

/*
 *
 * Copy one archive_string to another in locale conversion.
 *
 *	archive_strncpy_in_locale();
 *	archive_strcpy_in_locale();
 *
 */

2016

/*
 * Length in bytes of the multibyte string at _p, looking at no more
 * than the first n bytes.  A NULL pointer counts as an empty string.
 */
static size_t
mbsnbytes(const void *_p, size_t n)
{
	const char *bytes = (const char *)_p;
	size_t count;

	if (bytes == NULL)
		return (0);

	/* Like strlen(), except it never examines bytes[n] or beyond. */
	for (count = 0; count < n; count++) {
		if (bytes[count] == '\0')
			break;
	}
	return (count);
}

2036

/*
 * Length in bytes of the UTF-16 string at _p, looking at no more than
 * the first n bytes.  The string ends at the first 16-bit unit whose
 * two bytes are both zero; an odd trailing byte is never counted.
 * A NULL pointer counts as an empty string.
 */
static size_t
utf16nbytes(const void *_p, size_t n)
{
	const char *bytes = (const char *)_p;
	size_t units, max_units;

	if (bytes == NULL)
		return (0);

	/* Work in whole 16-bit units. */
	max_units = n / 2;
	for (units = 0; units < max_units; units++) {
		const char *unit = bytes + units * 2;

		if (unit[0] == 0 && unit[1] == 0)
			break;
	}
	return (units * 2);
}

2057

2058

int

2059

archive_strncpy_in_locale(struct archive_string *as, const void *_p, size_t n,

2060

struct archive_string_conv *sc)

2061

{

2062

as->length = 0;

2063

return (archive_strncat_in_locale(as, _p, n, sc));

2064

}

2065

2066

int

2067

archive_strncat_in_locale(struct archive_string *as, const void *_p, size_t n,

2068

struct archive_string_conv *sc)

2069

{

2070

const void *s;

2071

size_t length;

2072

int i, r = 0, r2;

2073

2074

/* We must allocate memory even if there is no data for conversion

2075

* or copy. This simulates archive_string_append behavior. */

2076

if (_p == NULL || n == 0) {

2077

int tn = 1;

2078

if (sc != NULL && (sc->flag & SCONV_TO_UTF16))

2079

tn = 2;

2080

if (archive_string_ensure(as, as->length + tn) == NULL)

2081

return (-1);

2082

as->s[as->length] = 0;

2083

if (tn == 2)

2084

as->s[as->length+1] = 0;

2085

return (0);

2086

}

2087

2088

2089

* If sc is NULL, we just make a copy.

2090

2091

if (sc == NULL) {

2092

length = mbsnbytes(_p, n);

2093

if (archive_string_append(as, _p, length) == NULL)

2094

return (-1);/* No memory */

2095

return (0);

2096

}

2097

2098

if (sc->flag & SCONV_FROM_UTF16)

2099

length = utf16nbytes(_p, n);

2100

else

2101

length = mbsnbytes(_p, n);

2102

s = _p;

2103

i = 0;

2104

if (sc->nconverter > 1) {

2105

sc->utftmp.length = 0;

2106

r2 = sc->converter[0](&(sc->utftmp), s, length, sc);

2107

if (r2 != 0 && errno == ENOMEM)

2108

return (r2);

2109

if (r > r2)

2110

r = r2;

2111

s = sc->utftmp.s;

2112

length = sc->utftmp.length;

2113

++i;

2114

}

2115

r2 = sc->converter[i](as, s, length, sc);

2116

if (r > r2)

2117

r = r2;

2118

return (r);

2119

}

2120

2121

#if HAVE_ICONV

2122

2123

2124

* Return -1 if conversion failes.

2125

2126

static int

2127

iconv_strncat_in_locale(struct archive_string *as, const void *_p,

2128

size_t length, struct archive_string_conv *sc)

2129

{

2130

ICONV_CONST char *inp;

2131

size_t remaining;

2132

iconv_t cd;

2133

char *outp;

2134

size_t avail, bs;

2135

int return_value = 0; /* success */

2136

int to_size, from_size;

2137

2138

if (sc->flag & SCONV_TO_UTF16)

2139

to_size = 2;

2140

else

2141

to_size = 1;

2142

if (sc->flag & SCONV_FROM_UTF16)

2143

from_size = 2;

2144

else

2145

from_size = 1;

2146

2147

if (archive_string_ensure(as, as->length + length*2+to_size) == NULL)

2148

return (-1);

2149

2150

cd = sc->cd;

2151

inp = (char *)(uintptr_t)_p;

2152

remaining = length;

2153

outp = as->s + as->length;

2154

avail = as->buffer_length - as->length - to_size;

2155

while (remaining >= (size_t)from_size) {

2156

size_t result = iconv(cd, &inp, &remaining, &outp, &avail);

2157

2158

if (result != (size_t)-1)

2159

break; /* Conversion completed. */

2160

2161

if (errno == EILSEQ || errno == EINVAL) {

2162

2163

* If an output charset is UTF-8 or UTF-16BE/LE,

2164

* unknown character should be U+FFFD

2165

* (replacement character).

2166

2167

if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {

2168

size_t rbytes;

2169

if (sc->flag & SCONV_TO_UTF8)

2170

rbytes = UTF8_R_CHAR_SIZE;

2171

else

2172

rbytes = 2;

2173

2174

if (avail < rbytes) {

2175

as->length = outp - as->s;

2176

bs = as->buffer_length +

2177

(remaining * to_size) + rbytes;

2178

if (NULL ==

2179

archive_string_ensure(as, bs))

2180

return (-1);

2181

outp = as->s + as->length;

2182

avail = as->buffer_length

2183

- as->length - to_size;

2184

}

2185

if (sc->flag & SCONV_TO_UTF8)

2186

UTF8_SET_R_CHAR(outp);

2187

else if (sc->flag & SCONV_TO_UTF16BE)

2188

archive_be16enc(outp, UNICODE_R_CHAR);

2189

else

2190

archive_le16enc(outp, UNICODE_R_CHAR);

2191

outp += rbytes;

2192

avail -= rbytes;

2193

} else {

2194

/* Skip the illegal input bytes. */

2195

*outp++ = '?';

2196

avail--;

2197

}

2198

inp += from_size;

2199

remaining -= from_size;

2200

return_value = -1; /* failure */

2201

} else {

2202

/* E2BIG no output buffer,

2203

* Increase an output buffer. */

2204

as->length = outp - as->s;

2205

bs = as->buffer_length + remaining * 2;

2206

if (NULL == archive_string_ensure(as, bs))

2207

return (-1);

2208

outp = as->s + as->length;

2209

avail = as->buffer_length - as->length - to_size;

2210

}

2211

}

2212

as->length = outp - as->s;

2213

as->s[as->length] = 0;

2214

if (to_size == 2)

2215

as->s[as->length+1] = 0;

2216

return (return_value);

2217

}

2218

2219

#endif /* HAVE_ICONV */

2220

2221

2222

#if defined(_WIN32) && !defined(__CYGWIN__)

2223

2224

2225

* Translate a string from a some CodePage to an another CodePage by

2226

* Windows APIs, and copy the result. Return -1 if conversion failes.

2227

2228

static int

2229

strncat_in_codepage(struct archive_string *as,

2230

const void *_p, size_t length, struct archive_string_conv *sc)

2231

{

2232

const char *s = (const char *)_p;

2233

struct archive_wstring aws;

2234

size_t l;

2235

int r, saved_flag;

2236

2237

archive_string_init(&aws);

2238

saved_flag = sc->flag;

2239

sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C);

2240

r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc);

2241

sc->flag = saved_flag;

2242

if (r != 0) {

2243

archive_wstring_free(&aws);

2244

if (errno != ENOMEM)

2245

archive_string_append(as, s, length);

2246

return (-1);

2247

}

2248

2249

l = as->length;

2250

r = archive_string_append_from_wcs_in_codepage(

2251

as, aws.s, aws.length, sc);

2252

if (r != 0 && errno != ENOMEM && l == as->length)

2253

archive_string_append(as, s, length);

2254

archive_wstring_free(&aws);

2255

return (r);

2256

}

2257

2258

2259

* Test whether MBS ==> WCS is okay.

2260

2261

static int

2262

invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)

2263

{

2264

const char *p = (const char *)_p;

2265

unsigned codepage;

2266

DWORD mbflag = MB_ERR_INVALID_CHARS;

2267

2268

if (sc->flag & SCONV_FROM_CHARSET)

2269

codepage = sc->to_cp;

2270

else

2271

codepage = sc->from_cp;

2272

2273

if (codepage == CP_C_LOCALE)

2274

return (0);

2275

if (codepage != CP_UTF8)

2276

mbflag |= MB_PRECOMPOSED;

2277

2278

if (MultiByteToWideChar(codepage, mbflag, p, n, NULL, 0) == 0)

2279

return (-1); /* Invalid */

2280

return (0); /* Okay */

2281

}

2282

2283

#else

2284

/*
 * Test whether MBS ==> WCS is okay.
 * Walks the n bytes at _p one multibyte character at a time and
 * returns -1 as soon as one cannot be converted, 0 otherwise.
 * Scanning stops early at a NUL character.
 */
static int
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
{
	const char *cursor = (const char *)_p;
	size_t used;
#if HAVE_MBRTOWC
	mbstate_t shift_state;
#endif

	(void)sc; /* UNUSED */
#if HAVE_MBRTOWC
	memset(&shift_state, 0, sizeof(shift_state));
#else
	/* Clear the shift state before starting. */
	mbtowc(NULL, NULL, 0);
#endif
	for (; n != 0; cursor += used, n -= used) {
		wchar_t wc;

#if HAVE_MBRTOWC
		used = mbrtowc(&wc, cursor, n, &shift_state);
#else
		used = mbtowc(&wc, cursor, n);
#endif
		if (used == (size_t)-1 || used == (size_t)-2)
			return (-1);/* Invalid. */
		if (used == 0)
			break;
	}
	return (0); /* All okay. */
}

2320

2321

#endif /* defined(_WIN32) && !defined(__CYGWIN__) */

2322

2323

2324

* Basically returns -1 because we cannot make a conversion of charset

2325

* without iconv but in some cases this would return 0.

2326

* Returns 0 if all copied characters are ASCII.

2327

* Returns 0 if both from-locale and to-locale are the same and those

2328

* can be WCS with no error.

2329

2330

static int

2331

best_effort_strncat_in_locale(struct archive_string *as, const void *_p,

2332

size_t length, struct archive_string_conv *sc)

2333

{

2334

size_t remaining;

2335

char *outp;

2336

const char *inp;

2337

size_t avail;

2338

int return_value = 0; /* success */

2339

2340

2341

* If both from-locale and to-locale is the same, this makes a copy.

2342

* And then this checks all copied MBS can be WCS if so returns 0.

2343

2344

if (sc->same) {

2345

if (archive_string_append(as, _p, length) == NULL)

2346

return (-1);/* No memory */

2347

return (invalid_mbs(_p, length, sc));

2348

}

2349

2350

2351

* If a character is ASCII, this just copies it. If not, this

2352

* assigns '?' charater instead but in UTF-8 locale this assigns

2353

* byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD,

2354

* a Replacement Character in Unicode.

2355

2356

if (archive_string_ensure(as, as->length + length + 1) == NULL)

2357

return (-1);

2358

2359

remaining = length;

2360

inp = (const char *)_p;

2361

outp = as->s + as->length;

2362

avail = as->buffer_length - as->length -1;

2363

while (*inp && remaining > 0) {

2364

if (*inp < 0 && (sc->flag & SCONV_TO_UTF8)) {

2365

if (avail < UTF8_R_CHAR_SIZE) {

2366

as->length = outp - as->s;

2367

if (NULL == archive_string_ensure(as,

2368

as->buffer_length + remaining +

2369

UTF8_R_CHAR_SIZE))

2370

return (-1);

2371

outp = as->s + as->length;

2372

avail = as->buffer_length - as->length -1;

2373

}

2374

2375

* When coping a string in UTF-8, unknown character

2376

* should be U+FFFD (replacement character).

2377

2378

UTF8_SET_R_CHAR(outp);

2379

outp += UTF8_R_CHAR_SIZE;

2380

avail -= UTF8_R_CHAR_SIZE;

2381

inp++;

2382

remaining--;

2383

return_value = -1;

2384

} else if (*inp < 0) {

2385

*outp++ = '?';

2386

inp++;

2387

remaining--;

2388

return_value = -1;

2389

} else {

2390

*outp++ = *inp++;

2391

remaining--;

2392

}

2393

}

2394

as->length = outp - as->s;

2395

as->s[as->length] = '\0';

2396

return (return_value);

2397

}

2398

2399

/*
 * Unicode conversion functions.
 *   - UTF-8 <===> UTF-8 in removing surrogate pairs.
 *   - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs.
 *   - UTF-8 made by libarchive 2.x ===> UTF-8.
 *   - UTF-16BE <===> UTF-8.
 *
 */

2408

2409

2410

* Utility to convert a single UTF-8 sequence.

2411

2412

* Usually return used bytes, return used byte in negative value when

2413

* a unicode character is replaced with U+FFFD.

2414

* See also http://unicode.org/review/pr-121.html Public Review Issue #121

2415

* Recommended Practice for Replacement Characters.

2416

2417

static int

2418

_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)

2419

{

2420

static const char utf8_count[256] = {

2421

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */

2422

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */

2423

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */

2424

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */

2425

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */

2426

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */

2427

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */

2428

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */

2429

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */

2430

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */

2431

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */

2432

0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */

2433

0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */

2434

2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */

2435

3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */

2436

4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */

2437

};

2438

int ch, i;

2439

int cnt;

2440

uint32_t wc;

2441

2442

/* Sanity check. */

2443

if (n == 0)

2444

return (0);

2445

2446

* Decode 1-4 bytes depending on the value of the first byte.

2447

2448

ch = (unsigned char)*s;

2449

if (ch == 0)

2450

return (0); /* Standard: return 0 for end-of-string. */

2451

cnt = utf8_count[ch];

2452

2453

/* Invalide sequence or there are not plenty bytes. */

2454

if ((int)n < cnt) {

2455

cnt = n;

2456

for (i = 1; i < cnt; i++) {

2457

if ((s[i] & 0xc0) != 0x80) {

2458

cnt = i;

2459

break;

2460

}

2461

}

2462

goto invalid_sequence;

2463

}

2464

2465

/* Make a Unicode code point from a single UTF-8 sequence. */

2466

switch (cnt) {

2467

case 1: /* 1 byte sequence. */

2468

*pwc = ch & 0x7f;

2469

return (cnt);

2470

case 2: /* 2 bytes sequence. */

2471

if ((s[1] & 0xc0) != 0x80) {

2472

cnt = 1;

2473

goto invalid_sequence;

2474

}

2475

*pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f);

2476

return (cnt);

2477

case 3: /* 3 bytes sequence. */

2478

if ((s[1] & 0xc0) != 0x80) {

2479

cnt = 1;

2480

goto invalid_sequence;

2481

}

2482

if ((s[2] & 0xc0) != 0x80) {

2483

cnt = 2;

2484

goto invalid_sequence;

2485

}

2486

wc = ((ch & 0x0f) << 12)

2487

| ((s[1] & 0x3f) << 6)

2488

| (s[2] & 0x3f);

2489

if (wc < 0x800)

2490

goto invalid_sequence;/* Overlong sequence. */

2491

break;

2492

case 4: /* 4 bytes sequence. */

2493

if ((s[1] & 0xc0) != 0x80) {

2494

cnt = 1;

2495

goto invalid_sequence;

2496

}

2497

if ((s[2] & 0xc0) != 0x80) {

2498

cnt = 2;

2499

goto invalid_sequence;

2500

}

2501

if ((s[3] & 0xc0) != 0x80) {

2502

cnt = 3;

2503

goto invalid_sequence;

2504

}

2505

wc = ((ch & 0x07) << 18)

2506

| ((s[1] & 0x3f) << 12)

2507

| ((s[2] & 0x3f) << 6)

2508

| (s[3] & 0x3f);

2509

if (wc < 0x10000)

2510

goto invalid_sequence;/* Overlong sequence. */

2511

break;

2512

default: /* Others are all invalid sequence. */

2513

if (ch == 0xc0 || ch == 0xc1)

2514

cnt = 2;

2515

else if (ch >= 0xf5 && ch <= 0xf7)

2516

cnt = 4;

2517

else if (ch >= 0xf8 && ch <= 0xfb)

2518

cnt = 5;

2519

else if (ch == 0xfc || ch == 0xfd)

2520

cnt = 6;

2521

else

2522

cnt = 1;

2523

if ((int)n < cnt)

2524

cnt = n;

2525

for (i = 1; i < cnt; i++) {

2526

if ((s[i] & 0xc0) != 0x80) {

2527

cnt = i;

2528

break;

2529

}

2530

}

2531

goto invalid_sequence;

2532

}

2533

2534

/* The code point larger than 0x10FFFF is not leagal

2535

* Unicode values. */

2536

if (wc > UNICODE_MAX)

2537

goto invalid_sequence;

2538

/* Correctly gets a Unicode, returns used bytes. */

2539

*pwc = wc;

2540

return (cnt);

2541

invalid_sequence:

2542

*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */

2543

return (cnt * -1);

2544

}

2545

/*
 * Convert a single UTF-8 sequence, additionally rejecting surrogate
 * code points.  Returns what _utf8_to_unicode() returns, except that a
 * three-byte sequence decoding to a surrogate yields -3; *pwc then
 * still holds the surrogate value.
 */
static int
utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	int used = _utf8_to_unicode(pwc, s, n);

	/* Any of Surrogate pair is not legal Unicode values. */
	return ((used == 3 && IS_SURROGATE_PAIR_LA(*pwc)) ? -3 : used);
}

2557

/*
 * Assemble a code point from a UTF-16 surrogate pair: `uc' is the high
 * (leading) surrogate and `uc2' the low (trailing) one.  The arguments
 * are not range-checked.
 */
static inline uint32_t
combine_surrogate_pair(uint32_t uc, uint32_t uc2)
{
	uint32_t high_bits = (uc - 0xD800) * 0x400;
	uint32_t low_bits = uc2 - 0xDC00;

	return (high_bits + low_bits + 0x10000);
}

2567

2568

2569

* Convert a single UTF-8/CESU-8 sequence to a Unicode code point in

2570

* removing surrogate pairs.

2571

2572

* CESU-8: The Compatibility Encoding Scheme for UTF-16.

2573

2574

* Usually return used bytes, return used byte in negative value when

2575

* a unicode character is replaced with U+FFFD.

2576

2577

static int

2578

cesu8_to_unicode(uint32_t *pwc, const char *s, size_t n)

2579

{

2580

uint32_t wc, wc2;

2581

int cnt;

2582

2583

cnt = _utf8_to_unicode(&wc, s, n);

2584

if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) {

2585

if (n - 3 < 3) {

2586

/* Invalid byte sequence. */

2587

goto invalid_sequence;

2588

}

2589

cnt = _utf8_to_unicode(&wc2, s+3, n-3);

2590

if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) {

2591

/* Invalid byte sequence. */

2592

goto invalid_sequence;

2593

}

2594

wc = combine_surrogate_pair(wc, wc2);

2595

cnt = 6;

2596

} else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) {

2597

/* Invalid byte sequence. */

2598

goto invalid_sequence;

2599

}

2600

*pwc = wc;

2601

return (cnt);

2602

invalid_sequence:

2603

*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */

2604

if (cnt > 0)

2605

cnt *= -1;

2606

return (cnt);

2607

}

2608

2609

2610

* Convert a Unicode code point to a single UTF-8 sequence.

2611

2612

* NOTE:This function does not check if the Unicode is leagal or not.

2613

* Please you definitely check it before calling this.

2614

2615

static size_t

2616

unicode_to_utf8(char *p, size_t remaining, uint32_t uc)

2617

{

2618

char *_p = p;

2619

2620

/* Translate code point to UTF8 */

2621

if (uc <= 0x7f) {

2622

if (remaining == 0)

2623

return (0);

2624

*p++ = (char)uc;

2625

} else if (uc <= 0x7ff) {

2626

if (remaining < 2)

2627

return (0);

2628

*p++ = 0xc0 | ((uc >> 6) & 0x1f);

2629

*p++ = 0x80 | (uc & 0x3f);

2630

} else if (uc <= 0xffff) {

2631

if (remaining < 3)

2632

return (0);

2633

*p++ = 0xe0 | ((uc >> 12) & 0x0f);

2634

*p++ = 0x80 | ((uc >> 6) & 0x3f);

2635

*p++ = 0x80 | (uc & 0x3f);

2636

} else if (uc <= UNICODE_MAX) {

2637

if (remaining < 4)

2638

return (0);

2639

*p++ = 0xf0 | ((uc >> 18) & 0x07);

2640

*p++ = 0x80 | ((uc >> 12) & 0x3f);

2641

*p++ = 0x80 | ((uc >> 6) & 0x3f);

2642

*p++ = 0x80 | (uc & 0x3f);

2643

} else {

2644

2645

* Undescribed code point should be U+FFFD

2646

* (replacement character).

2647

2648

if (remaining < UTF8_R_CHAR_SIZE)

2649

return (0);

2650

UTF8_SET_R_CHAR(p);

2651

p += UTF8_R_CHAR_SIZE;

2652

}

2653

return (p - _p);

2654

}

2655

/* Parse one code point from big-endian UTF-16 input. */
static int
utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 1;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}

2661

/* Parse one code point from little-endian UTF-16 input. */
static int
utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 0;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}

2667

2668

static int

2669

utf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be)

2670

{

2671

const char *utf16 = s;

2672

unsigned uc;

2673

2674

if (n == 0)

2675

return (0);

2676

if (n == 1) {

2677

/* set the Replacement Character instead. */

2678

*pwc = UNICODE_R_CHAR;

2679

return (-1);

2680

}

2681

2682

if (be)

2683

uc = archive_be16dec(utf16);

2684

else

2685

uc = archive_le16dec(utf16);

2686

utf16 += 2;

2687

2688

/* If this is a surrogate pair, assemble the full code point.*/

2689

if (IS_HIGH_SURROGATE_LA(uc)) {

2690

unsigned uc2;

2691

2692

if (n >= 4) {

2693

if (be)

2694

uc2 = archive_be16dec(utf16);

2695

else

2696

uc2 = archive_le16dec(utf16);

2697

} else

2698

uc2 = 0;

2699

if (IS_LOW_SURROGATE_LA(uc2)) {

2700

uc = combine_surrogate_pair(uc, uc2);

2701

utf16 += 2;

2702

} else {

2703

/* Undescribed code point should be U+FFFD

2704

* (replacement character). */

2705

*pwc = UNICODE_R_CHAR;

2706

return (-2);

2707

}

2708

}

2709

2710

2711

* Surrogate pair values(0xd800 through 0xdfff) are only

2712

* used by UTF-16, so, after above culculation, the code

2713

* must not be surrogate values, and Unicode has no codes

2714

* larger than 0x10ffff. Thus, those are not leagal Unicode

2715

* values.

2716

2717

if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) {

2718

/* Undescribed code point should be U+FFFD

2719

* (replacement character). */

2720

*pwc = UNICODE_R_CHAR;

2721

return (((int)(utf16 - s)) * -1);

2722

}

2723

*pwc = uc;

2724

return ((int)(utf16 - s));

2725

}

2726

/*
 * Store a code point at p as big-endian UTF-16.
 * Returns the number of bytes stored (2 or 4), or 0 when `remaining'
 * is too small.  The code point is not validated.
 */
static size_t
unicode_to_utf16be(char *p, size_t remaining, uint32_t uc)
{
	if (uc <= 0xffff) {
		if (remaining < 2)
			return (0);
		archive_be16enc(p, uc);
		return (2);
	}

	/* We have a code point that won't fit into a single
	 * 16-bit unit; convert it to a surrogate pair. */
	if (remaining < 4)
		return (0);
	uc -= 0x10000;
	archive_be16enc(p, ((uc >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (uc & 0x3ff) + 0xDC00);
	return (4);
}

2748

/*
 * Store a code point at p as little-endian UTF-16.
 * Returns the number of bytes stored (2 or 4), or 0 when `remaining'
 * is too small.  The code point is not validated.
 */
static size_t
unicode_to_utf16le(char *p, size_t remaining, uint32_t uc)
{
	if (uc <= 0xffff) {
		if (remaining < 2)
			return (0);
		archive_le16enc(p, uc);
		return (2);
	}

	/* We have a code point that won't fit into a single
	 * 16-bit unit; convert it to a surrogate pair. */
	if (remaining < 4)
		return (0);
	uc -= 0x10000;
	archive_le16enc(p, ((uc >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (uc & 0x3ff) + 0xDC00);
	return (4);
}

2770

2771

2772

* Copy UTF-8 string in checking surrogate pair.

2773

* If any surrogate pair are found, it would be canonicalized.

2774

2775

static int

2776

strncat_from_utf8_to_utf8(struct archive_string *as, const void *_p, size_t len,

2777

struct archive_string_conv *sc)

2778

{

2779

const char *s;

2780

char *p, *endp;

2781

int n, ret = 0;

2782

2783

(void)sc; /* UNUSED */

2784

2785

if (archive_string_ensure(as, as->length + len + 1) == NULL)

2786

return (-1);

2787

2788

s = (const char *)_p;

2789

p = as->s + as->length;

2790

endp = as->s + as->buffer_length -1;

2791

do {

2792

uint32_t uc;

2793

const char *ss = s;

2794

size_t w;

2795

2796

2797

* Forward byte sequence until a conversion of that is needed.

2798

2799

while ((n = utf8_to_unicode(&uc, s, len)) > 0) {

2800

s += n;

2801

len -= n;

2802

}

2803

if (ss < s) {

2804

if (p + (s - ss) > endp) {

2805

as->length = p - as->s;

2806

if (archive_string_ensure(as,

2807

as->buffer_length + len + 1) == NULL)

2808

return (-1);

2809

p = as->s + as->length;

2810

endp = as->s + as->buffer_length -1;

2811

}

2812

2813

memcpy(p, ss, s - ss);

2814

p += s - ss;

2815

}

2816

2817

2818

* If n is negative, current byte sequence needs a replacement.

2819

2820

if (n < 0) {

2821

if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) {

2822

/* Current byte sequence may be CESU-8. */

2823

n = cesu8_to_unicode(&uc, s, len);

2824

}

2825

if (n < 0) {

2826

ret = -1;

2827

n *= -1;/* Use a replaced unicode character. */

2828

}

2829

2830

/* Rebuild UTF-8 byte sequence. */

2831

while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) {

2832

as->length = p - as->s;

2833

if (archive_string_ensure(as,

2834

as->buffer_length + len + 1) == NULL)

2835

return (-1);

2836

p = as->s + as->length;

2837

endp = as->s + as->buffer_length -1;

2838

}

2839

p += w;

2840

s += n;

2841

len -= n;

2842

}

2843

} while (n > 0);

2844

as->length = p - as->s;

2845

as->s[as->length] = '\0';

2846

return (ret);

2847

}

2848

2849

static int

2850

archive_string_append_unicode(struct archive_string *as, const void *_p,

2851

size_t len, struct archive_string_conv *sc)

2852

{

2853

const char *s;

2854

char *p, *endp;

2855

uint32_t uc;

2856

size_t w;

2857

int n, ret = 0, ts, tm;

2858

int (*parse)(uint32_t *, const char *, size_t);

2859

size_t (*unparse)(char *, size_t, uint32_t);

2860

2861

if (sc->flag & SCONV_TO_UTF16BE) {

2862

unparse = unicode_to_utf16be;

2863

ts = 2;

2864

} else if (sc->flag & SCONV_TO_UTF16LE) {

2865

unparse = unicode_to_utf16le;

2866

ts = 2;

2867

} else if (sc->flag & SCONV_TO_UTF8) {

2868

unparse = unicode_to_utf8;

2869

ts = 1;

2870

} else {

2871

2872

* This case is going to be converted to another

2873

* character-set through iconv.

2874

2875

if (sc->flag & SCONV_FROM_UTF16BE) {

2876

unparse = unicode_to_utf16be;

2877

ts = 2;

2878

} else if (sc->flag & SCONV_FROM_UTF16LE) {

2879

unparse = unicode_to_utf16le;

2880

ts = 2;

2881

} else {

2882

unparse = unicode_to_utf8;

2883

ts = 1;

2884

}

2885

}

2886

2887

if (sc->flag & SCONV_FROM_UTF16BE) {

2888

parse = utf16be_to_unicode;

2889

tm = 1;

2890

} else if (sc->flag & SCONV_FROM_UTF16LE) {

2891

parse = utf16le_to_unicode;

2892

tm = 1;

2893

} else {

2894

parse = cesu8_to_unicode;

2895

tm = ts;

2896

}

2897

2898

if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)

2899

return (-1);

2900

2901

s = (const char *)_p;

2902

p = as->s + as->length;

2903

endp = as->s + as->buffer_length - ts;

2904

while ((n = parse(&uc, s, len)) != 0) {

2905

if (n < 0) {

2906

/* Use a replaced unicode character. */

2907

n *= -1;

2908

ret = -1;

2909

}

2910

s += n;

2911

len -= n;

2912

while ((w = unparse(p, endp - p, uc)) == 0) {

2913

/* There is not enough output buffer so

2914

* we have to expand it. */

2915

as->length = p - as->s;

2916

if (archive_string_ensure(as,

2917

as->buffer_length + len * tm + ts) == NULL)

2918

return (-1);

2919

p = as->s + as->length;

2920

endp = as->s + as->buffer_length - ts;

2921

}

2922

p += w;

2923

}

2924

as->length = p - as->s;

2925

as->s[as->length] = '\0';

2926

if (ts == 2)

2927

as->s[as->length+1] = '\0';

2928

return (ret);

2929

}

2930

/*
 * Constants for Hangul composition.  This information comes from
 * Unicode Standard Annex #15 http://unicode.org/reports/tr15/
 */
#define HC_SBASE	0xAC00
#define HC_LBASE	0x1100
#define HC_VBASE	0x1161
#define HC_TBASE	0x11A7
#define HC_LCOUNT	19
#define HC_VCOUNT	21
#define HC_TCOUNT	28
#define HC_NCOUNT	(HC_VCOUNT * HC_TCOUNT)
#define HC_SCOUNT	(HC_LCOUNT * HC_NCOUNT)

2944

2945

static uint32_t

2946

get_nfc(uint32_t uc, uint32_t uc2)

2947

{

2948

int t, b;

2949

2950

t = 0;

2951

b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1;

2952

while (b >= t) {

2953

int m = (t + b) / 2;

2954

if (u_composition_table[m].cp1 < uc)

2955

t = m + 1;

2956

else if (u_composition_table[m].cp1 > uc)

2957

b = m - 1;

2958

else if (u_composition_table[m].cp2 < uc2)

2959

t = m + 1;

2960

else if (u_composition_table[m].cp2 > uc2)

2961

b = m - 1;

2962

else

2963

return (u_composition_table[m].nfc);

2964

}

2965

return (0);

2966

}

2967

#define FDC_MAX 10	/* The maximum number of Following Decomposable
			 * Characters. */

/*
 * Update first code point.
 * Clearing ucptr makes WRITE_UC() re-encode uc instead of copying the
 * original bytes.
 */
#define UPDATE_UC(new_uc)	do {		\
	uc = (new_uc);				\
	ucptr = NULL;				\
} while (0)

/*
 * Replace first code point with second code point.
 */
#define REPLACE_UC_WITH_UC2() do {		\
	uc = uc2;				\
	ucptr = uc2ptr;				\
	n = n2;					\
} while (0)

/*
 * Grow the output buffer and re-derive p and endp from it.
 * Returns -1 from the enclosing function when memory runs out.
 */
#define EXPAND_BUFFER() do {			\
	as->length = p - as->s;			\
	if (archive_string_ensure(as,		\
	    as->buffer_length + len * tm + ts) == NULL)\
		return (-1);			\
	p = as->s + as->length;			\
	endp = as->s + as->buffer_length - ts;	\
} while (0)

/*
 * Encode uc at p with the selected unparse function, growing the
 * buffer until the encoded form fits.
 */
#define UNPARSE(p, endp, uc)	do {		\
	while ((w = unparse(p, (endp) - (p), uc)) == 0) {\
		EXPAND_BUFFER();		\
	}					\
	p += w;					\
} while (0)

/*
 * Write first code point.
 * If the code point has not be changed from its original code,
 * this just copies it from its original buffer pointer.
 * If not, this converts it to UTF-8 byte sequence and copies it.
 */
#define WRITE_UC()	do {			\
	if (ucptr) {				\
		if (p + n > endp)		\
			EXPAND_BUFFER();	\
		switch (n) {			\
		case 4:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 3:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 2:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 1:				\
			*p++ = *ucptr;		\
			break;			\
		}				\
		ucptr = NULL;			\
	} else {				\
		UNPARSE(p, endp, uc);		\
	}					\
} while (0)

/*
 * Collect following decomposable code points.
 * Sets ret to -1 when FDC_MAX code points have been collected.
 */
#define COLLECT_CPS(start)	do {		\
	int _i;					\
	for (_i = (start); _i < FDC_MAX ; _i++) {\
		nx = parse(&ucx[_i], s, len);	\
		if (nx <= 0)			\
			break;			\
		cx = CCC(ucx[_i]);		\
		if (cl >= cx && cl != 228 && cx != 228)\
			break;			\
		s += nx;			\
		len -= nx;			\
		cl = cx;			\
		ccx[_i] = cx;			\
	}					\
	if (_i >= FDC_MAX) {			\
		ret = -1;			\
		ucx_size = FDC_MAX;		\
	} else					\
		ucx_size = _i;			\
} while (0)

3057

3058

3059

* Normalize UTF-8/UTF-16BE characters to Form C and copy the result.

3060

3061

* TODO: Convert composition exclusions,which are never converted

3062

* from NFC,NFD,NFKC and NFKD, to Form C.

3063

3064

static int

3065

archive_string_normalize_C(struct archive_string *as, const void *_p,

3066

size_t len, struct archive_string_conv *sc)

3067

{

3068

const char *s = (const char *)_p;

3069

char *p, *endp;

3070

uint32_t uc, uc2;

3071

size_t w;

3072

int always_replace, n, n2, ret = 0, spair, ts, tm;

3073

int (*parse)(uint32_t *, const char *, size_t);

3074

size_t (*unparse)(char *, size_t, uint32_t);

3075

3076

always_replace = 1;

3077

ts = 1;/* text size. */

3078

if (sc->flag & SCONV_TO_UTF16BE) {

3079

unparse = unicode_to_utf16be;

3080

ts = 2;

3081

if (sc->flag & SCONV_FROM_UTF16BE)

3082

always_replace = 0;

3083

} else if (sc->flag & SCONV_TO_UTF16LE) {

3084

unparse = unicode_to_utf16le;

3085

ts = 2;

3086

if (sc->flag & SCONV_FROM_UTF16LE)

3087

always_replace = 0;

3088

} else if (sc->flag & SCONV_TO_UTF8) {

3089

unparse = unicode_to_utf8;

3090

if (sc->flag & SCONV_FROM_UTF8)

3091

always_replace = 0;

3092

} else {

3093

3094

* This case is going to be converted to another

3095

* character-set through iconv.

3096

3097

always_replace = 0;

3098

if (sc->flag & SCONV_FROM_UTF16BE) {

3099

unparse = unicode_to_utf16be;

3100

ts = 2;

3101

} else if (sc->flag & SCONV_FROM_UTF16LE) {

3102

unparse = unicode_to_utf16le;

3103

ts = 2;

3104

} else {

3105

unparse = unicode_to_utf8;

3106

}

3107

}

3108

3109

if (sc->flag & SCONV_FROM_UTF16BE) {

3110

parse = utf16be_to_unicode;

3111

tm = 1;

3112

spair = 4;/* surrogate pair size in UTF-16. */

3113

} else if (sc->flag & SCONV_FROM_UTF16LE) {

3114

parse = utf16le_to_unicode;

3115

tm = 1;

3116

spair = 4;/* surrogate pair size in UTF-16. */

3117

} else {

3118

parse = cesu8_to_unicode;

3119

tm = ts;

3120

spair = 6;/* surrogate pair size in UTF-8. */

3121

}

3122

3123

if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)

3124

return (-1);

3125

3126

p = as->s + as->length;

3127

endp = as->s + as->buffer_length - ts;

3128

while ((n = parse(&uc, s, len)) != 0) {

3129

const char *ucptr, *uc2ptr;

3130

3131

if (n < 0) {

3132

/* Use a replaced unicode character. */

3133

UNPARSE(p, endp, uc);

3134

s += n*-1;

3135

len -= n*-1;

3136

ret = -1;

3137

continue;

3138

} else if (n == spair || always_replace)

3139

/* uc is converted from a surrogate pair.

3140

* this should be treated as a changed code. */

3141

ucptr = NULL;

3142

else

3143

ucptr = s;

3144

s += n;

3145

len -= n;

3146

3147

/* Read second code point. */

3148

while ((n2 = parse(&uc2, s, len)) > 0) {

3149

uint32_t ucx[FDC_MAX];

3150

int ccx[FDC_MAX];

3151

int cl, cx, i, nx, ucx_size;

3152

int LIndex,SIndex;

3153

uint32_t nfc;

3154

3155

if (n2 == spair || always_replace)

3156

/* uc2 is converted from a surrogate pair.

3157

* this should be treated as a changed code. */

3158

uc2ptr = NULL;

3159

else

3160

uc2ptr = s;

3161

s += n2;

3162

len -= n2;

3163

3164

3165

* If current second code point is out of decomposable

3166

* code points, finding compositions is unneeded.

3167

3168

if (!IS_DECOMPOSABLE_BLOCK(uc2)) {

3169

WRITE_UC();

3170

REPLACE_UC_WITH_UC2();

3171

continue;

3172

}

3173

3174

3175

* Try to combine current code points.

3176

3177

3178

* We have to combine Hangul characters according to

3179

* http://uniicode.org/reports/tr15/#Hangul

3180

3181

if (0 <= (LIndex = uc - HC_LBASE) &&

3182

LIndex < HC_LCOUNT) {

3183

3184

* Hangul Composition.

3185

* 1. Two current code points are L and V.

3186

3187

int VIndex = uc2 - HC_VBASE;

3188

if (0 <= VIndex && VIndex < HC_VCOUNT) {

3189

/* Make syllable of form LV. */

3190

UPDATE_UC(HC_SBASE +

3191

(LIndex * HC_VCOUNT + VIndex) *

3192

HC_TCOUNT);

3193

} else {

3194

WRITE_UC();

3195

REPLACE_UC_WITH_UC2();

3196

}

3197

continue;

3198

} else if (0 <= (SIndex = uc - HC_SBASE) &&

3199

SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) {

3200

3201

* Hangul Composition.

3202

* 2. Two current code points are LV and T.

3203

3204

int TIndex = uc2 - HC_TBASE;

3205

if (0 < TIndex && TIndex < HC_TCOUNT) {

3206

/* Make syllable of form LVT. */

3207

UPDATE_UC(uc + TIndex);

3208

} else {

3209

WRITE_UC();

3210

REPLACE_UC_WITH_UC2();

3211

}

3212

continue;

3213

} else if ((nfc = get_nfc(uc, uc2)) != 0) {

3214

/* A composition to current code points

3215

* is found. */

3216

UPDATE_UC(nfc);

3217

continue;

3218

} else if ((cl = CCC(uc2)) == 0) {

3219

/* Clearly 'uc2' the second code point is not

3220

* a decomposable code. */

3221

WRITE_UC();

3222

REPLACE_UC_WITH_UC2();

3223

continue;

3224

}

3225

3226

3227

* Collect following decomposable code points.

3228

3229

cx = 0;

3230

ucx[0] = uc2;

3231

ccx[0] = cl;

3232

COLLECT_CPS(1);

3233

3234

3235

* Find a composed code in the collected code points.

3236

3237

i = 1;

3238

while (i < ucx_size) {

3239

int j;

3240

3241

if ((nfc = get_nfc(uc, ucx[i])) == 0) {

3242

i++;

3243

continue;

3244

}

3245

3246

3247

* nfc is composed of uc and ucx[i].

3248

3249

UPDATE_UC(nfc);

3250

3251

3252

* Remove ucx[i] by shifting

3253

* following code points.

3254

3255

for (j = i; j+1 < ucx_size; j++) {

3256

ucx[j] = ucx[j+1];

3257

ccx[j] = ccx[j+1];

3258

}

3259

ucx_size --;

3260

3261

3262

* Collect following code points blocked

3263

* by ucx[i] the removed code point.

3264

3265

if (ucx_size > 0 && i == ucx_size &&

3266

nx > 0 && cx == cl) {

3267

cl = ccx[ucx_size-1];

3268

COLLECT_CPS(ucx_size);

3269

}

3270

3271

* Restart finding a composed code with

3272

* the updated uc from the top of the

3273

* collected code points.

3274

3275

i = 0;

3276

}

3277

3278

3279

* Apparently the current code points are not

3280

* decomposed characters or already composed.

3281

3282

WRITE_UC();

3283

for (i = 0; i < ucx_size; i++)

3284

UNPARSE(p, endp, ucx[i]);

3285

3286

3287

* Flush out remaining canonical combining characters.

3288

3289

if (nx > 0 && cx == cl && len > 0) {

3290

while ((nx = parse(&ucx[0], s, len))

3291

> 0) {

3292

cx = CCC(ucx[0]);

3293

if (cl > cx)

3294

break;

3295

s += nx;

3296

len -= nx;

3297

cl = cx;

3298

UNPARSE(p, endp, ucx[0]);

3299

}

3300

}

3301

break;

3302

}

3303

if (n2 < 0) {

3304

WRITE_UC();

3305

/* Use a replaced unicode character. */

3306

UNPARSE(p, endp, uc2);

3307

s += n2*-1;

3308

len -= n2*-1;

3309

ret = -1;

3310

continue;

3311

} else if (n2 == 0) {

3312

WRITE_UC();

3313

break;

3314

}

3315

}

3316

as->length = p - as->s;

3317

as->s[as->length] = '\0';

3318

if (ts == 2)

3319

as->s[as->length+1] = '\0';

3320

return (ret);

3321

}

3322

3323

#if defined(__APPLE__)

3324

3325

3326

* Normalize UTF-8 characters to Form D and copy the result.

3327

3328

static int

3329

archive_string_normalize_D(struct archive_string *as, const void *_p,

3330

size_t len, struct archive_string_conv *sc)

3331

{

3332

const UniChar *inp;

3333

char *outp;

3334

size_t newsize;

3335

ByteCount inCount, outCount;

3336

ByteCount inAvail, outAvail;

3337

OSStatus err;

3338

int ret, saved_flag;

3339

3340

3341

* Convert the current string to UTF-16LE for normalization.

3342

* The character-set of the current string must be UTF-16BE or

3343

* UTF-8.

3344

3345

archive_string_empty(&(sc->utf16nfc));

3346

saved_flag = sc->flag;/* save a flag. */

3347

sc->flag &= ~(SCONV_TO_UTF16BE | SCONV_TO_UTF8);

3348

sc->flag |= SCONV_TO_UTF16LE;

3349

ret = archive_string_append_unicode(&(sc->utf16nfc), _p, len, sc);

3350

sc->flag = saved_flag;/* restore the saved flag */

3351

if (archive_strlen(&(sc->utf16nfc)) == 0) {

3352

if (archive_string_ensure(as, as->length + 1) == NULL)

3353

return (-1);

3354

return (ret);

3355

}

3356

3357

3358

* Normalize an NFC string to be an NFD(HFS Plus version).

3359

3360

newsize = sc->utf16nfc.length + 2;

3361

if (archive_string_ensure(&(sc->utf16nfd), newsize) == NULL)

3362

return (-1);

3363

3364

inp = (UniChar *)sc->utf16nfc.s;

3365

inAvail = archive_strlen(&(sc->utf16nfc));

3366

sc->utf16nfd.length = 0;

3367

outp = sc->utf16nfd.s;

3368

outAvail = sc->utf16nfd.buffer_length -2;

3369

3370

do {

3371

/* Reinitialize all state information. */

3372

if (ResetUnicodeToTextInfo(sc->uniInfo) != noErr)

3373

goto return_no_changed_data;

3374

3375

inCount = outCount = 0;

3376

err = ConvertFromUnicodeToText(sc->uniInfo,

3377

inAvail, inp,

3378

kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL,

3379

outAvail, &inCount, &outCount, outp);

3380

3381

if (err == noErr) {

3382

sc->utf16nfd.length = outCount;

3383

sc->utf16nfd.s[sc->utf16nfd.length] = 0;

3384

sc->utf16nfd.s[sc->utf16nfd.length+1] = 0;

3385

} else if (err == kTECOutputBufferFullStatus) {

3386

newsize = inAvail - inCount;

3387

if (newsize > inAvail)

3388

newsize = inAvail;

3389

newsize += sc->utf16nfd.buffer_length + 2;

3390

if (archive_string_ensure(&(sc->utf16nfd), newsize)

3391

== NULL)

3392

return (-1);

3393

outp = sc->utf16nfd.s;

3394

outAvail = sc->utf16nfd.buffer_length -2;

3395

} else

3396

goto return_no_changed_data;

3397

} while (err == kTECOutputBufferFullStatus);

3398

3399

3400

* If there is a next-step conversion, we should convert

3401

* a UTF-16LE(NFD) string back to the original Unicode type.

3402

3403

saved_flag = sc->flag;/* save a flag. */

3404

if (!(sc->flag &

3405

(SCONV_TO_UTF16BE | SCONV_TO_UTF16LE | SCONV_TO_UTF8))) {

3406

3407

* This case is going to be converted to another

3408

* character-set through iconv.

3409

3410

if (sc->flag & SCONV_FROM_UTF16BE)

3411

sc->flag |= SCONV_TO_UTF16BE;

3412

else if (sc->flag & SCONV_FROM_UTF16LE)

3413

sc->flag |= SCONV_TO_UTF16LE;

3414

else

3415

sc->flag |= SCONV_TO_UTF8;

3416

}

3417

sc->flag &= ~(SCONV_FROM_UTF16BE | SCONV_FROM_UTF8);

3418

sc->flag |= SCONV_FROM_UTF16LE;

3419

if (archive_string_append_unicode(as, sc->utf16nfd.s,

3420

sc->utf16nfd.length, sc) != 0)

3421

ret = -1;

3422

sc->flag = saved_flag;/* restore the saved flag */

3423

return (ret);

3424

3425

return_no_changed_data:

3426

3427

* Something conversion error happened, so we return a no normalized

3428

* string with an error.

3429

3430

(void)archive_string_append_unicode(as, _p, len, sc);

3431

return (-1);

3432

}

3433

3434

#endif /* __APPLE__ */

3435

3436

3437

* libarchive 2.x made incorrect UTF-8 strings in the wrong assumption

3438

* that WCS is Unicode. It is true for several platforms but some are false.

3439

* And then people who did not use UTF-8 locale on the non Unicode WCS

3440

* platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those

3441

* now cannot get right filename from libarchive 3.x and later since we

3442

* fixed the wrong assumption and it is incompatible to older its versions.

3443

* So we provide special option, "compat-2x.x", for resolving it.

3444

* That option enable the string conversion of libarchive 2.x.

3445

3446

* Translates the wrong UTF-8 string made by libarchive 2.x into current

3447

* locale character set and appends to the archive_string.

3448

* Note: returns -1 if conversion fails.

3449

3450

static int

3451

strncat_from_utf8_libarchive2(struct archive_string *as,

3452

const void *_p, size_t len, struct archive_string_conv *sc)

3453

{

3454

const char *s;

3455

int n;

3456

char *p;

3457

char *end;

3458

uint32_t unicode;

3459

#if HAVE_WCRTOMB

3460

mbstate_t shift_state;

3461

3462

memset(&shift_state, 0, sizeof(shift_state));

3463

#else

3464

/* Clear the shift state before starting. */

3465

wctomb(NULL, L'\0');

3466

#endif

3467

(void)sc; /* UNUSED */

3468

3469

* Allocate buffer for MBS.

3470

* We need this allocation here since it is possible that

3471

* as->s is still NULL.

3472

3473

if (archive_string_ensure(as, as->length + len + 1) == NULL)

3474

return (-1);

3475

3476

s = (const char *)_p;

3477

p = as->s + as->length;

3478

end = as->s + as->buffer_length - MB_CUR_MAX -1;

3479

while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) {

3480

wchar_t wc;

3481

3482

if (p >= end) {

3483

as->length = p - as->s;

3484

/* Re-allocate buffer for MBS. */

3485

if (archive_string_ensure(as,

3486

as->length + len * 2 + 1) == NULL)

3487

return (-1);

3488

p = as->s + as->length;

3489

end = as->s + as->buffer_length - MB_CUR_MAX -1;

3490

}

3491

3492

3493

* As libarchie 2.x, translates the UTF-8 characters into

3494

* wide-characters in the assumption that WCS is Unicode.

3495

3496

if (n < 0) {

3497

n *= -1;

3498

wc = L'?';

3499

} else

3500

wc = (wchar_t)unicode;

3501

3502

s += n;

3503

len -= n;

3504

3505

* Translates the wide-character into the current locale MBS.

3506

3507

#if HAVE_WCRTOMB

3508

n = wcrtomb(p, wc, &shift_state);

3509

#else

3510

n = wctomb(p, wc);

3511

#endif

443

3512

if (n == -1)

444

return (NULL);

3513

return (-1);

445

3514

p += n;

446

3515

}

447

*p = '\0';

448

archive_strcat(as, buff);

449

return (as);

450

#endif

451

}

452

453

#endif /* _WIN32 && ! __CYGWIN__ */

3516

as->length = p - as->s;

3517

as->s[as->length] = '\0';

3518

return (0);

3519

}

3520

3521

3522

3523

/*
 * Conversion functions between current locale dependent MBS and UTF-16BE.
 *   strncat_from_utf16be() : UTF-16BE --> MBS
 *   strncat_to_utf16be()   : MBS --> UTF-16BE
 */

3526

3527

3528

#if defined(_WIN32) && !defined(__CYGWIN__)

3529

3530

3531

* Convert a UTF-16BE/LE string to current locale and copy the result.

3532

* Return -1 if conversion failes.

3533

3534

static int

3535

win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes,

3536

struct archive_string_conv *sc, int be)

3537

{

3538

struct archive_string tmp;

3539

const char *u16;

3540

int ll;

3541

BOOL defchar;

3542

char *mbs;

3543

size_t mbs_size, b;

3544

int ret = 0;

3545

3546

bytes &= ~1;

3547

if (archive_string_ensure(as, as->length + bytes +1) == NULL)

3548

return (-1);

3549

3550

mbs = as->s + as->length;

3551

mbs_size = as->buffer_length - as->length -1;

3552

3553

if (sc->to_cp == CP_C_LOCALE) {

3554

3555

* "C" locale special process.

3556

3557

u16 = _p;

3558

ll = 0;

3559

for (b = 0; b < bytes; b += 2) {

3560

uint16_t val;

3561

if (be)

3562

val = archive_be16dec(u16+b);

3563

else

3564

val = archive_le16dec(u16+b);

3565

if (val > 255) {

3566

*mbs++ = '?';

3567

ret = -1;

3568

} else

3569

*mbs++ = (char)(val&0xff);

3570

ll++;

3571

}

3572

as->length += ll;

3573

as->s[as->length] = '\0';

3574

return (ret);

3575

}

3576

3577

archive_string_init(&tmp);

3578

if (be) {

3579

if (is_big_endian()) {

3580

u16 = _p;

3581

} else {

3582

if (archive_string_ensure(&tmp, bytes+2) == NULL)

3583

return (-1);

3584

memcpy(tmp.s, _p, bytes);

3585

for (b = 0; b < bytes; b += 2) {

3586

uint16_t val = archive_be16dec(tmp.s+b);

3587

archive_le16enc(tmp.s+b, val);

3588

}

3589

u16 = tmp.s;

3590

}

3591

} else {

3592

if (!is_big_endian()) {

3593

u16 = _p;

3594

} else {

3595

if (archive_string_ensure(&tmp, bytes+2) == NULL)

3596

return (-1);

3597

memcpy(tmp.s, _p, bytes);

3598

for (b = 0; b < bytes; b += 2) {

3599

uint16_t val = archive_le16dec(tmp.s+b);

3600

archive_be16enc(tmp.s+b, val);

3601

}

3602

u16 = tmp.s;

3603

}

3604

}

3605

3606

do {

3607

defchar = 0;

3608

ll = WideCharToMultiByte(sc->to_cp, 0,

3609

(LPCWSTR)u16, bytes>>1, mbs, mbs_size,

3610

NULL, &defchar);

3611

if (ll == 0 &&

3612

GetLastError() == ERROR_INSUFFICIENT_BUFFER) {

3613

/* Need more buffer for MBS. */

3614

ll = WideCharToMultiByte(sc->to_cp, 0,

3615

(LPCWSTR)u16, bytes, NULL, 0, NULL, NULL);

3616

if (archive_string_ensure(as, ll +1) == NULL)

3617

return (-1);

3618

mbs = as->s + as->length;

3619

mbs_size = as->buffer_length - as->length -1;

3620

continue;

3621

}

3622

} while (0);

3623

archive_string_free(&tmp);

3624

as->length += ll;

3625

as->s[as->length] = '\0';

3626

if (ll == 0 || defchar)

3627

ret = -1;

3628

return (ret);

3629

}

3630

3631

/*
 * Big-endian front end of win_strncat_from_utf16(): calls it with be = 1.
 */
static int
win_strncat_from_utf16be(struct archive_string *as, const void *_p, size_t bytes,
    struct archive_string_conv *sc)
{
	return (win_strncat_from_utf16(as, _p, bytes, sc, 1));
}

3637

3638

/*
 * Little-endian front end of win_strncat_from_utf16(): calls it with be = 0.
 */
static int
win_strncat_from_utf16le(struct archive_string *as, const void *_p, size_t bytes,
    struct archive_string_conv *sc)
{
	return (win_strncat_from_utf16(as, _p, bytes, sc, 0));
}

3644

3645

/*
 * Return non-zero when the host stores 16-bit integers big-endian:
 * the in-memory bytes of the value 1 are decoded as big-endian and
 * compared against 1.
 */
static int
is_big_endian(void)
{
	uint16_t d = 1;

	return (archive_be16dec(&d) == 1);
}

3652

3653

3654

* Convert a current locale string to UTF-16BE/LE and copy the result.

3655

* Return -1 if conversion failes.

3656

3657

static int

3658

win_strncat_to_utf16(struct archive_string *as16, const void *_p, size_t length,

3659

struct archive_string_conv *sc, int bigendian)

3660

{

3661

const char *s = (const char *)_p;

3662

char *u16;

3663

size_t count, avail;

3664

3665

if (archive_string_ensure(as16,

3666

as16->length + (length + 1) * 2) == NULL)

3667

return (-1);

3668

3669

u16 = as16->s + as16->length;

3670

avail = as16->buffer_length - 2;

3671

if (sc->from_cp == CP_C_LOCALE) {

3672

3673

* "C" locale special process.

3674

3675

count = 0;

3676

while (count < length && *s) {

3677

if (bigendian)

3678

archive_be16enc(u16, *s);

3679

else

3680

archive_le16enc(u16, *s);

3681

u16 += 2;

3682

s++;

3683

count++;

3684

}

3685

as16->length += count << 1;

3686

as16->s[as16->length] = 0;

3687

as16->s[as16->length+1] = 0;

3688

return (0);

3689

}

3690

do {

3691

count = MultiByteToWideChar(sc->from_cp,

3692

MB_PRECOMPOSED, s, length, (LPWSTR)u16, (int)avail>>1);

3693

if (count == 0 &&

3694

GetLastError() == ERROR_INSUFFICIENT_BUFFER) {

3695

/* Need more buffer for UTF-16 string */

3696

count = MultiByteToWideChar(sc->from_cp,

3697

MB_PRECOMPOSED, s, length, NULL, 0);

3698

if (archive_string_ensure(as16, (count +1) * 2)

3699

== NULL)

3700

return (-1);

3701

u16 = as16->s + as16->length;

3702

avail = as16->buffer_length - 2;

3703

continue;

3704

}

3705

} while (0);

3706

as16->length += count * 2;

3707

as16->s[as16->length] = 0;

3708

as16->s[as16->length+1] = 0;

3709

if (count == 0)

3710

return (-1);

3711

3712

if (is_big_endian()) {

3713

if (!bigendian) {

3714

while (count > 0) {

3715

uint16_t v = archive_be16dec(u16);

3716

archive_le16enc(u16, v);

3717

u16 += 2;

3718

count--;

3719

}

3720

}

3721

} else {

3722

if (bigendian) {

3723

while (count > 0) {

3724

uint16_t v = archive_le16dec(u16);

3725

archive_be16enc(u16, v);

3726

u16 += 2;

3727

count--;

3728

}

3729

}

3730

}

3731

return (0);

3732

}

3733

3734

/*
 * Big-endian front end of win_strncat_to_utf16(): calls it with
 * bigendian = 1.
 */
static int
win_strncat_to_utf16be(struct archive_string *as16, const void *_p, size_t length,
    struct archive_string_conv *sc)
{
	return (win_strncat_to_utf16(as16, _p, length, sc, 1));
}

3740

3741

/*
 * Little-endian front end of win_strncat_to_utf16(): calls it with
 * bigendian = 0.
 */
static int
win_strncat_to_utf16le(struct archive_string *as16, const void *_p, size_t length,
    struct archive_string_conv *sc)
{
	return (win_strncat_to_utf16(as16, _p, length, sc, 0));
}

3747

3748

#endif /* _WIN32 && !__CYGWIN__ */

3749

3750

3751

/*
 * Make a best effort at conversion.
 * We cannot handle the UTF-16BE character-set without iconv,
 * but conversion is still possible if the string consists only of
 * ASCII characters or the current locale is UTF-8.
 */

3755

3756

3757

3758

* Convert a UTF-16BE string to current locale and copy the result.

3759

* Return -1 if conversion failes.

3760

3761

static int

3762

best_effort_strncat_from_utf16(struct archive_string *as, const void *_p,

3763

size_t bytes, struct archive_string_conv *sc, int be)

3764

{

3765

const char *utf16 = (const char *)_p;

3766

char *mbs;

3767

uint32_t uc;

3768

int n, ret;

3769

3770

(void)sc; /* UNUSED */

3771

3772

* Other case, we should do the best effort.

3773

* If all character are ASCII(<0x7f), we can convert it.

3774

* if not , we set a alternative character and return -1.

3775

3776

ret = 0;

3777

if (archive_string_ensure(as, as->length + bytes +1) == NULL)

3778

return (-1);

3779

mbs = as->s + as->length;

3780

3781

while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) {

3782

if (n < 0) {

3783

n *= -1;

3784

ret = -1;

3785

}

3786

bytes -= n;

3787

utf16 += n;

3788

3789

if (uc > 127) {

3790

/* We cannot handle it. */

3791

*mbs++ = '?';

3792

ret = -1;

3793

} else

3794

*mbs++ = (char)uc;

3795

}

3796

as->length = mbs - as->s;

3797

as->s[as->length] = '\0';

3798

return (ret);

3799

}

3800

3801

/*
 * Big-endian front end of best_effort_strncat_from_utf16(): calls it
 * with be = 1.
 */
static int
best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 1));
}

3807

3808

/*
 * Little-endian front end of best_effort_strncat_from_utf16(): calls it
 * with be = 0.
 */
static int
best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p,
    size_t bytes, struct archive_string_conv *sc)
{
	return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 0));
}

3814

3815

3816

* Convert a current locale string to UTF-16BE/LE and copy the result.

3817

* Return -1 if conversion failes.

3818

3819

static int

3820

best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p,

3821

size_t length, struct archive_string_conv *sc, int bigendian)

3822

{

3823

const char *s = (const char *)_p;

3824

char *utf16;

3825

size_t remaining;

3826

int ret;

3827

3828

(void)sc; /* UNUSED */

3829

3830

* Other case, we should do the best effort.

3831

* If all character are ASCII(<0x7f), we can convert it.

3832

* if not , we set a alternative character and return -1.

3833

3834

ret = 0;

3835

remaining = length;

3836

3837

if (archive_string_ensure(as16,

3838

as16->length + (length + 1) * 2) == NULL)

3839

return (-1);

3840

3841

utf16 = as16->s + as16->length;

3842

while (remaining--) {

3843

unsigned c = *s++;

3844

if (c > 127) {

3845

/* We cannot handle it. */

3846

c = UNICODE_R_CHAR;

3847

ret = -1;

3848

}

3849

if (bigendian)

3850

archive_be16enc(utf16, c);

3851

else

3852

archive_le16enc(utf16, c);

3853

utf16 += 2;

3854

}

3855

as16->length = utf16 - as16->s;

3856

as16->s[as16->length] = 0;

3857

as16->s[as16->length+1] = 0;

3858

return (ret);

3859

}

3860

3861

/*
 * Big-endian front end of best_effort_strncat_to_utf16(): calls it with
 * bigendian = 1.
 */
static int
best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	return (best_effort_strncat_to_utf16(as16, _p, length, sc, 1));
}

3867

3868

/*
 * Little-endian front end of best_effort_strncat_to_utf16(): calls it
 * with bigendian = 0.
 */
static int
best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	return (best_effort_strncat_to_utf16(as16, _p, length, sc, 0));
}

3874

3875

3876

3877

/*
 * Multistring operations.
 */

3878

3879

3880

void

3881

archive_mstring_clean(struct archive_mstring *aes)

3882

{

3883

archive_wstring_free(&(aes->aes_wcs));

3884

archive_string_free(&(aes->aes_mbs));

3885

archive_string_free(&(aes->aes_utf8));

3886

archive_string_free(&(aes->aes_mbs_in_locale));

3887

aes->aes_set = 0;

3888

}

3889

3890

void

3891

archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src)

3892

{

3893

dest->aes_set = src->aes_set;

3894

archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs));

3895

archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8));

3896

archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs));

3897

}

3898

3899

int

3900

archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,

3901

const char **p)

3902

{

3903

struct archive_string_conv *sc;

3904

int r;

3905

3906

/* If we already have a UTF8 form, return that immediately. */

3907

if (aes->aes_set & AES_SET_UTF8) {

3908

*p = aes->aes_utf8.s;

3909

return (0);

3910

}

3911

3912

*p = NULL;

3913

if (aes->aes_set & AES_SET_MBS) {

3914

sc = archive_string_conversion_to_charset(a, "UTF-8", 1);

3915

if (sc == NULL)

3916

return (-1);/* Couldn't allocate memory for sc. */

3917

r = archive_strncpy_in_locale(&(aes->aes_mbs), aes->aes_mbs.s,

3918

aes->aes_mbs.length, sc);

3919

if (a == NULL)

3920

free_sconv_object(sc);

3921

if (r == 0) {

3922

aes->aes_set |= AES_SET_UTF8;

3923

*p = aes->aes_utf8.s;

3924

return (0);/* success. */

3925

} else

3926

return (-1);/* failure. */

3927

}

3928

return (0);/* success. */

3929

}

3930

3931

int

3932

archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes,

3933

const char **p)

3934

{

3935

int r, ret = 0;

3936

3937

(void)a; /* UNUSED */

3938

/* If we already have an MBS form, return that immediately. */

3939

if (aes->aes_set & AES_SET_MBS) {

3940

*p = aes->aes_mbs.s;

3941

return (ret);

3942

}

3943

3944

*p = NULL;

3945

/* If there's a WCS form, try converting with the native locale. */

3946

if (aes->aes_set & AES_SET_WCS) {

3947

archive_string_empty(&(aes->aes_mbs));

3948

r = archive_string_append_from_wcs(&(aes->aes_mbs),

3949

aes->aes_wcs.s, aes->aes_wcs.length);

3950

*p = aes->aes_mbs.s;

3951

if (r == 0) {

3952

aes->aes_set |= AES_SET_MBS;

3953

return (ret);

3954

} else

3955

ret = -1;

3956

}

3957

3958

3959

* Only a UTF-8 form cannot avail because its conversion already

3960

* failed at archive_mstring_update_utf8().

3961

3962

return (ret);

3963

}

3964

3965

int

3966

archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,

3967

const wchar_t **wp)

3968

{

3969

int r, ret = 0;

3970

3971

(void)a;/* UNUSED */

3972

/* Return WCS form if we already have it. */

3973

if (aes->aes_set & AES_SET_WCS) {

3974

*wp = aes->aes_wcs.s;

3975

return (ret);

3976

}

3977

3978

*wp = NULL;

3979

/* Try converting MBS to WCS using native locale. */

3980

if (aes->aes_set & AES_SET_MBS) {

3981

archive_wstring_empty(&(aes->aes_wcs));

3982

r = archive_wstring_append_from_mbs(&(aes->aes_wcs),

3983

aes->aes_mbs.s, aes->aes_mbs.length);

3984

if (r == 0) {

3985

aes->aes_set |= AES_SET_WCS;

3986

*wp = aes->aes_wcs.s;

3987

} else

3988

ret = -1;/* failure. */

3989

}

3990

return (ret);

3991

}

3992

3993

int

3994

archive_mstring_get_mbs_l(struct archive_mstring *aes,

3995

const char **p, size_t *length, struct archive_string_conv *sc)

3996

{

3997

int r, ret = 0;

3998

3999

#if defined(_WIN32) && !defined(__CYGWIN__)

4000

4001

* Internationalization programing on Windows must use Wide

4002

* characters because Windows platform cannot make locale UTF-8.

4003

4004

if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) {

4005

archive_string_empty(&(aes->aes_mbs_in_locale));

4006

r = archive_string_append_from_wcs_in_codepage(

4007

&(aes->aes_mbs_in_locale), aes->aes_wcs.s,

4008

aes->aes_wcs.length, sc);

4009

if (r == 0) {

4010

*p = aes->aes_mbs_in_locale.s;

4011

if (length != NULL)

4012

*length = aes->aes_mbs_in_locale.length;

4013

return (0);

4014

} else if (errno == ENOMEM)

4015

return (-1);

4016

else

4017

ret = -1;

4018

}

4019

#endif

4020

4021

/* If there is not an MBS form but is a WCS form, try converting

4022

* with the native locale to be used for translating it to specified

4023

* character-set. */

4024

if ((aes->aes_set & AES_SET_MBS) == 0 &&

4025

(aes->aes_set & AES_SET_WCS) != 0) {

4026

archive_string_empty(&(aes->aes_mbs));

4027

r = archive_string_append_from_wcs(&(aes->aes_mbs),

4028

aes->aes_wcs.s, aes->aes_wcs.length);

4029

if (r == 0)

4030

aes->aes_set |= AES_SET_MBS;

4031

else if (errno == ENOMEM)

4032

return (-1);

4033

else

4034

ret = -1;

4035

}

4036

/* If we already have an MBS form, use it to be translated to

4037

* specified character-set. */

4038

if (aes->aes_set & AES_SET_MBS) {

4039

if (sc == NULL) {

4040

/* Conversion is unneeded. */

4041

*p = aes->aes_mbs.s;

4042

if (length != NULL)

4043

*length = aes->aes_mbs.length;

4044

return (0);

4045

}

4046

ret = archive_strncpy_in_locale(&(aes->aes_mbs_in_locale),

4047

aes->aes_mbs.s, aes->aes_mbs.length, sc);

4048

*p = aes->aes_mbs_in_locale.s;

4049

if (length != NULL)

4050

*length = aes->aes_mbs_in_locale.length;

4051

} else {

4052

*p = NULL;

4053

if (length != NULL)

4054

*length = 0;

4055

}

4056

return (ret);

4057

}

4058

4059

int

4060

archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs)

4061

{

4062

if (mbs == NULL) {

4063

aes->aes_set = 0;

4064

return (0);

4065

}

4066

return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs)));

4067

}

4068

4069

int

4070

archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs,

4071

size_t len)

4072

{

4073

if (mbs == NULL) {

4074

aes->aes_set = 0;

4075

return (0);

4076

}

4077

aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */

4078

archive_strncpy(&(aes->aes_mbs), mbs, len);

4079

archive_string_empty(&(aes->aes_utf8));

4080

archive_wstring_empty(&(aes->aes_wcs));

4081

return (0);

4082

}

4083

4084

/*
 * Set the multistring from a NUL-terminated wide string.
 * Passes length 0 for a NULL `wcs`; returns the result of
 * archive_mstring_copy_wcs_len().
 */
int
archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs)
{
	return archive_mstring_copy_wcs_len(aes, wcs, wcs == NULL ? 0 : wcslen(wcs));
}

4089

4090

int

4091

archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs,

4092

size_t len)

4093

{

4094

if (wcs == NULL) {

4095

aes->aes_set = 0;

4096

}

4097

aes->aes_set = AES_SET_WCS; /* Only WCS form set. */

4098

archive_string_empty(&(aes->aes_mbs));

4099

archive_string_empty(&(aes->aes_utf8));

4100

archive_wstrncpy(&(aes->aes_wcs), wcs, len);

4101

return (0);

4102

}

4103

4104

int

4105

archive_mstring_copy_mbs_len_l(struct archive_mstring *aes,

4106

const char *mbs, size_t len, struct archive_string_conv *sc)

4107

{

4108

int r;

4109

4110

if (mbs == NULL) {

4111

aes->aes_set = 0;

4112

return (0);

4113

}

4114

archive_string_empty(&(aes->aes_mbs));

4115

archive_wstring_empty(&(aes->aes_wcs));

4116

archive_string_empty(&(aes->aes_utf8));

4117

#if defined(_WIN32) && !defined(__CYGWIN__)

4118

4119

* Internationalization programing on Windows must use Wide

4120

* characters because Windows platform cannot make locale UTF-8.

4121

4122

if (sc == NULL) {

4123

if (archive_string_append(&(aes->aes_mbs),

4124

mbs, mbsnbytes(mbs, len)) == NULL) {

4125

aes->aes_set = 0;

4126

r = -1;

4127

} else {

4128

aes->aes_set = AES_SET_MBS;

4129

r = 0;

4130

}

4131

#if defined(HAVE_ICONV)

4132

} else if (sc != NULL && sc->cd_w != (iconv_t)-1) {

4133

4134

* This case happens only when MultiByteToWideChar() cannot

4135

* handle sc->from_cp, and we have to iconv in order to

4136

* translate character-set to wchar_t,UTF-16.

4137

4138

iconv_t cd = sc->cd;

4139

unsigned from_cp;

4140

int flag;

4141

4142

4143

* Translate multi-bytes from some character-set to UTF-8.

4144

4145

sc->cd = sc->cd_w;

4146

r = archive_strncpy_in_locale(&(aes->aes_utf8), mbs, len, sc);

4147

sc->cd = cd;

4148

if (r != 0) {

4149

aes->aes_set = 0;

4150

return (r);

4151

}

4152

aes->aes_set = AES_SET_UTF8;

4153

4154

4155

* Append the UTF-8 string into wstring.

4156

4157

flag = sc->flag;

4158

sc->flag &= ~(SCONV_NORMALIZATION_C

4159

| SCONV_TO_UTF16| SCONV_FROM_UTF16);

4160

from_cp = sc->from_cp;

4161

sc->from_cp = CP_UTF8;

4162

r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),

4163

aes->aes_utf8.s, aes->aes_utf8.length, sc);

4164

sc->flag = flag;

4165

sc->from_cp = from_cp;

4166

if (r == 0)

4167

aes->aes_set |= AES_SET_WCS;

4168

#endif

4169

} else {

4170

r = archive_wstring_append_from_mbs_in_codepage(

4171

&(aes->aes_wcs), mbs, len, sc);

4172

if (r == 0)

4173

aes->aes_set = AES_SET_WCS;

4174

else

4175

aes->aes_set = 0;

4176

}

4177

#else

4178

r = archive_strncpy_in_locale(&(aes->aes_mbs), mbs, len, sc);

4179

if (r == 0)

4180

aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */

4181

else

4182

aes->aes_set = 0;

4183

#endif

4184

return (r);

4185

}

4186

4187

4188

* The 'update' form tries to proactively update all forms of

4189

* this string (WCS and MBS) and returns an error if any of

4190

* them fail. This is used by the 'pax' handler, for instance,

4191

* to detect and report character-conversion failures early while

4192

* still allowing clients to get potentially useful values from

4193

* the more tolerant lazy conversions. (get_mbs and get_wcs will

4194

* strive to give the user something useful, so you can get hopefully

4195

* usable values even if some of the character conversions are failing.)

4196

4197

int

4198

archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,

4199

const char *utf8)

4200

{

4201

struct archive_string_conv *sc;

4202

int r;

4203

4204

if (utf8 == NULL) {

4205

aes->aes_set = 0;

4206

return (0); /* Succeeded in clearing everything. */

4207

}

4208

4209

/* Save the UTF8 string. */

4210

archive_strcpy(&(aes->aes_utf8), utf8);

4211

4212

/* Empty the mbs and wcs strings. */

4213

archive_string_empty(&(aes->aes_mbs));

4214

archive_wstring_empty(&(aes->aes_wcs));

4215

4216

aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */

4217

4218

/* Try converting UTF-8 to MBS, return false on failure. */

4219

sc = archive_string_conversion_from_charset(a, "UTF-8", 1);

4220

if (sc == NULL)

4221

return (-1);/* Couldn't allocate memory for sc. */

4222

r = archive_strcpy_in_locale(&(aes->aes_mbs), utf8, sc);

4223

if (a == NULL)

4224

free_sconv_object(sc);

4225

if (r != 0)

4226

return (-1);

4227

aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */

4228

4229

/* Try converting MBS to WCS, return false on failure. */

4230

if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,

4231

aes->aes_mbs.length))

4232

return (-1);

4233

aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;

4234

4235

/* All conversions succeeded. */

4236

return (0);

4237

}

Older »