45
43
/// Value a conversion call yields on failure (the largest size_t); results
/// are compared against it to detect errors
static const size_t iconv_failed = static_cast<size_t>(-1);
49
bool operator()(const char* s1, const char* s2) const {
50
return strcmp(s1, s2) < 0;
46
using namespace agi::charset;
54
agi::charset::Converter *get_converter(bool subst, const char *src, const char *dst);
48
/// @brief Forward declaration of the converter factory defined later in this file
/// @param subst Passed through to the converter's constructor
/// @param src Name of the encoding to convert from
/// @param dst Name of the encoding to convert to
/// @return A converter allocated with new; the caller takes ownership
Converter *get_converter(bool subst, const char *src, const char *dst);
56
50
/// @brief Map a user-friendly encoding name to the real encoding name
57
const char* get_real_encoding_name(const char* name) {
58
static std::map<const char*, const char*, ltstr> pretty_names;
60
if (pretty_names.empty()) {
61
# define ADD(pretty, real) pretty_names[pretty] = real
51
const char *get_real_encoding_name(const char *name) {
52
struct pair { const char *pretty; const char *real; };
53
static pair pretty_names[] = {
54
# define ADD(pretty, real) pair{pretty, real},
62
55
# include <libaegisub/charsets.def>
66
auto real = pretty_names.find(name);
67
if (real != pretty_names.end())
59
static bool init = false;
62
boost::sort(pretty_names, [](pair a, pair b) {
63
return strcmp(a.pretty, b.pretty) < 0;
67
auto enc = boost::lower_bound(pretty_names, name, [](pair a, const char *b) {
68
return strcmp(a.pretty, b) < 0;
71
if (enc != std::end(pretty_names) && strcmp(enc->pretty, name) == 0)
72
size_t get_bom_size(iconv_t cd) {
76
size_t get_bom_size(Iconv& cd) {
73
77
// Most (but not all) iconv implementations automatically insert a BOM
74
78
// at the beginning of text converted to UTF-8, UTF-16 and UTF-32, but
75
79
// we usually don't want this, as some of the wxString using code
86
size_t res = iconv(cd, ICONV_CONST_CAST(&src), &srcLen, &dst, &dstLen);
90
size_t res = cd(&src, &srcLen, &dst, &dstLen);
87
91
assert(res != iconv_failed);
88
92
assert(srcLen == 0);
101
void eat_bom(iconv_t cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
105
void eat_bom(Iconv& cd, size_t bomSize, const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
102
106
// If this encoding has a forced BOM (i.e. it's UTF-16 or UTF-32 without
103
107
// a specified byte order), skip over it
104
108
if (bomSize > 0 && inbytesleft && *inbytesleft) {
110
114
size_t dstSize = std::min((size_t)8, bomSize + *outbytesleft);
111
115
const char *src = *inbuf;
112
116
size_t srcSize = *inbytesleft;
113
iconv(cd, ICONV_CONST_CAST(&src), &srcSize, &dst, &dstSize);
117
cd(&src, &srcSize, &dst, &dstSize);
117
121
// Calculate the size of NUL in the given character set
118
size_t nul_size(const char* encoding) {
122
size_t nul_size(const char *encoding) {
119
123
// We need a character set to convert from with a known encoding of NUL;
120
124
// UTF-8 seems like the obvious choice
121
std::unique_ptr<agi::charset::Converter> cd(get_converter(false, "UTF-8", encoding));
125
std::unique_ptr<Converter> cd(get_converter(false, "UTF-8", encoding));
124
128
char sbuff[] = "";
137
141
#ifdef ICONV_POSIX
138
class ConverterImpl : public agi::charset::Converter {
142
class ConverterImpl final : public Converter {
142
146
// subst is not used here because POSIX doesn't let you disable substitution
143
147
ConverterImpl(bool, const char* sourceEncoding, const char* destEncoding)
145
149
const char *dstEnc = get_real_encoding_name(destEncoding);
146
cd = iconv_open(dstEnc, "UTF-8");
147
if (cd == iconv_invalid)
148
throw agi::charset::UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
150
cd = Iconv("utf-8", dstEnc);
150
152
bomSize = get_bom_size(cd);
152
cd = iconv_open(dstEnc, get_real_encoding_name(sourceEncoding));
153
if (cd == iconv_invalid)
154
throw agi::charset::UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
157
if (cd != iconv_invalid) iconv_close(cd);
153
cd = Iconv(get_real_encoding_name(sourceEncoding), dstEnc);
159
156
size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) {
160
157
eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
162
size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
159
size_t res = cd(inbuf, inbytesleft, outbuf, outbytesleft);
164
161
// This loop never does anything useful with a POSIX-compliant iconv
165
162
// implementation, but those don't seem to actually exist
166
163
while (res == iconv_failed && errno != E2BIG) {
169
res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
166
res = cd(inbuf, inbytesleft, outbuf, outbytesleft);
178
class ConverterImpl : public iconv_fallbacks, public agi::charset::Converter {
175
class ConverterImpl final : public iconv_fallbacks, public Converter {
180
177
char invalidRep[8];
181
178
size_t invalidRepSize;
183
180
static void fallback(
184
181
unsigned int code,
185
182
void (*callback) (const char *buf, size_t buflen, void* callback_arg),
196
193
callback(self->invalidRep, self->invalidRepSize, callback_arg);
199
ConverterImpl(ConverterImpl const&);
200
ConverterImpl& operator=(ConverterImpl const&);
202
198
ConverterImpl(bool subst, const char* sourceEncoding, const char* destEncoding)
204
200
const char *dstEnc = get_real_encoding_name(destEncoding);
205
cd = iconv_open(dstEnc, "UTF-8");
206
if (cd == iconv_invalid)
207
throw agi::charset::UnsupportedConversion(std::string(dstEnc) + " is not a supported character set");
201
cd = Iconv("utf-8", dstEnc);
209
203
bomSize = get_bom_size(cd);
222
216
invalidRepSize = 4 - dstLen;
225
cd = iconv_open(dstEnc, get_real_encoding_name(sourceEncoding));
226
if (cd == iconv_invalid)
227
throw agi::charset::UnsupportedConversion(std::string("Cannot convert from ") + sourceEncoding + " to " + destEncoding);
218
cd = Iconv(get_real_encoding_name(sourceEncoding), dstEnc);
238
229
iconvctl(cd, ICONV_SET_FALLBACKS, static_cast<iconv_fallbacks*>(this));
242
if (cd != iconv_invalid) iconv_close(cd);
244
233
size_t Convert(const char** inbuf, size_t* inbytesleft, char** outbuf, size_t* outbytesleft) override {
245
234
eat_bom(cd, bomSize, inbuf, inbytesleft, outbuf, outbytesleft);
246
size_t res = iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
235
size_t res = cd(inbuf, inbytesleft, outbuf, outbytesleft);
248
237
if (res == iconv_failed && errno == E2BIG && *outbytesleft == 0) {
249
238
// libiconv checks if there are any bytes left in the output buffer
256
245
const char* in = *inbuf;
257
246
size_t insize = *inbytesleft;
259
res = iconv(cd, ICONV_CONST_CAST(&in), &insize, &out, &buffsize);
248
res = cd(&in, &insize, &out, &buffsize);
260
249
// If no bytes of the output buffer were used, the original
261
250
// conversion may have been successful
262
251
if (buffsize != 8) {
273
agi::charset::Converter *get_converter(bool subst, const char *src, const char *dst) {
262
Converter *get_converter(bool subst, const char *src, const char *dst) {
275
264
return new ConverterImpl(subst, src, dst);
277
catch (agi::charset::UnsupportedConversion const&) {
266
catch (UnsupportedConversion const&) {
278
267
if (strcmp(dst, "ISO-6937-2"))
280
return new agi::charset::Converter6937(subst, src);
269
return new Converter6937(subst, src);
285
274
namespace agi { namespace charset {
275
/// Construct an Iconv that holds no conversion descriptor: cd starts out as
/// the iconv_invalid sentinel
Iconv::Iconv()
: cd(iconv_invalid)
{
}
277
Iconv::Iconv(const char *source, const char *dest)
278
: cd(iconv_open(dest, source))
280
if (cd == iconv_invalid)
281
throw UnsupportedConversion(std::string("Cannot convert from ") + source + " to " + dest);
285
if (cd != iconv_invalid) iconv_close(cd);
288
size_t Iconv::operator()(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) {
289
return iconv(cd, ICONV_CONST_CAST(inbuf), inbytesleft, outbuf, outbytesleft);
287
292
IconvWrapper::IconvWrapper(const char* sourceEncoding, const char* destEncoding, bool enableSubst)
288
293
: conv(get_converter(enableSubst, sourceEncoding, destEncoding))
292
297
toNulLen = nul_size(destEncoding);
293
298
fromNulLen = nul_size(sourceEncoding);
295
IconvWrapper::~IconvWrapper() {
298
std::string IconvWrapper::Convert(std::string const& source) {
301
/// Destructor with no explicit cleanup of its own; members are torn down by
/// their own destructors.
/// NOTE(review): presumably defined out of line so the header does not need
/// the complete Converter type -- confirm against the class declaration
IconvWrapper::~IconvWrapper() = default;
303
std::string IconvWrapper::Convert(const char *source, size_t len) {
299
304
std::string dest;
300
Convert(source, dest);
305
Convert(source, len, dest);
303
void IconvWrapper::Convert(std::string const& source, std::string &dest) {
309
void IconvWrapper::Convert(const char *src, size_t srcLen, std::string &dest) {
306
const char *src = source.data();
307
size_t srcLen = source.size();
310
314
char *dst = buff;