45
45
/* Alias for ENCINDEX_BUILTIN_MAX.
 * NOTE(review): presumably the number of built-in encoding indexes --
 * confirm where ENCINDEX_BUILTIN_MAX is defined. */
#define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
46
46
/* Sentinel equal to INT_MAX.
 * NOTE(review): presumably stands for "no encoding index specified" --
 * confirm at the use sites. */
#define UNSPECIFIED_ENCODING INT_MAX
48
/* Longest accepted encoding name, in bytes as measured by strlen()
 * (the terminating NUL is not counted). */
#define ENCODING_NAMELEN_MAX 63
49
/*
 * Non-zero when name is a non-NULL C string whose strlen() does not
 * exceed ENCODING_NAMELEN_MAX.  The argument is expanded twice, so it
 * must not have side effects.
 */
#define valid_encoding_name_p(name) \
    ((name) != NULL && strlen(name) <= ENCODING_NAMELEN_MAX)
48
51
/* Non-zero when rb_enc_mbmaxlen(enc) is zero.
 * NOTE(review): presumably a zero max length marks an encoding whose
 * definition has not been loaded yet -- confirm. */
#define enc_autoload_p(enc) (rb_enc_mbmaxlen(enc) == 0)
50
53
/* Forward declaration so that code above the definition can call
 * load_encoding(); the definition itself is not part of this view. */
static int load_encoding(const char *name);
56
enc_memsize(const void *p)
61
static const rb_data_type_t encoding_data_type = {
62
"encoding", 0, 0, enc_memsize,
65
/*
 * Non-zero when obj is a typed-data object whose type descriptor is
 * encoding_data_type.  obj is expanded twice, so it must not have side
 * effects.
 */
#define is_data_encoding(obj) \
    (RTYPEDDATA_P(obj) && &encoding_data_type == RTYPEDDATA_TYPE(obj))
58
68
enc_new(rb_encoding *encoding)
60
return Data_Wrap_Struct(rb_cEncoding, enc_mark, 0, encoding);
70
return TypedData_Wrap_Struct(rb_cEncoding, &encoding_data_type, encoding);
64
rb_enc_from_encoding(rb_encoding *encoding)
74
rb_enc_from_encoding_index(int idx)
69
if (!encoding) return Qnil;
70
idx = ENC_TO_ENCINDEX(encoding);
71
78
if (!(list = rb_encoding_list)) {
72
rb_bug("rb_enc_from_encoding(%d\"%s\"): no rb_encoding_list",
73
idx, rb_enc_name(encoding));
79
rb_bug("rb_enc_from_encoding_index(%d): no rb_encoding_list", idx);
75
81
enc = rb_ary_entry(list, idx);
77
rb_bug("rb_enc_from_encoding(%d\"%s\"): not created yet",
78
idx, rb_enc_name(encoding));
83
rb_bug("rb_enc_from_encoding_index(%d): not created yet", idx);
89
rb_enc_from_encoding(rb_encoding *encoding)
92
if (!encoding) return Qnil;
93
idx = ENC_TO_ENCINDEX(encoding);
94
return rb_enc_from_encoding_index(idx);
83
97
/* Forward declaration of enc_autoload(); the definition is not part of
 * this view. */
static int enc_autoload(rb_encoding *enc);
98
112
enc_check_encoding(VALUE obj)
100
if (SPECIAL_CONST_P(obj) || BUILTIN_TYPE(obj) != T_DATA ||
101
RDATA(obj)->dmark != enc_mark) {
114
if (SPECIAL_CONST_P(obj) || !rb_typeddata_is_kind_of(obj, &encoding_data_type)) {
104
117
return check_encoding(RDATA(obj)->data);
293
* Set base encoding for encodings which are not replicas
294
* but not in their own files.
297
rb_enc_set_base(const char *name, const char *orig)
299
int idx = rb_enc_registered(name);
300
int origidx = rb_enc_registered(orig);
301
set_base_encoding(idx, rb_enc_from_index(origidx));
279
305
rb_enc_replicate(const char *name, rb_encoding *encoding)
318
* enc.replicate(name) -> encoding
320
* Returns a replicated encoding of _enc_ whose name is _name_.
321
* The new encoding should have the same byte structure as _enc_.
322
* If _name_ is used by another encoding, raises ArgumentError.
326
enc_replicate(VALUE encoding, VALUE name)
328
return rb_enc_from_encoding_index(
329
rb_enc_replicate(StringValueCStr(name),
330
rb_to_encoding(encoding)));
291
enc_replicate(int idx, const char *name, rb_encoding *origenc)
334
enc_replicate_with_index(const char *name, rb_encoding *origenc, int idx)
294
337
idx = enc_register(name, origenc);
312
355
if (origidx < 0) {
313
356
origidx = enc_register(orig, 0);
315
return enc_replicate(idx, name, rb_enc_from_index(origidx));
358
return enc_replicate_with_index(name, rb_enc_from_index(origidx), idx);
329
372
rb_encdb_dummy(const char *name)
331
int index = enc_replicate(rb_enc_registered(name), name,
332
rb_ascii8bit_encoding());
374
int index = enc_replicate_with_index(name, rb_ascii8bit_encoding(),
375
rb_enc_registered(name));
333
376
rb_encoding *enc = enc_table.list[index].enc;
335
378
ENC_SET_DUMMY(enc);
355
398
return ENC_DUMMY_P(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse;
403
* enc.ascii_compatible? -> true or false
405
* Returns whether the encoding is ASCII-compatible or not.
407
* Encoding::UTF_8.ascii_compatible? #=> true
408
* Encoding::UTF_16BE.ascii_compatible? #=> false
412
enc_ascii_compatible_p(VALUE enc)
414
return rb_enc_asciicompat(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse;
418
* Returns 1 when the encoding is of the Unicode series other than UTF-7, otherwise 0.
421
rb_enc_unicode_p(rb_encoding *enc)
423
const char *name = rb_enc_name(enc);
424
return name[0] == 'U' && name[1] == 'T' && name[2] == 'F' && name[4] != '7';
358
427
static const char *
359
428
enc_alias_internal(const char *alias, int idx)
367
436
enc_alias(const char *alias, int idx)
438
if (!valid_encoding_name_p(alias)) return -1;
369
439
alias = enc_alias_internal(alias, idx);
370
440
set_encoding_const(alias, rb_enc_from_index(idx));
617
687
/* enc_check_capable(obj);*/
618
688
if (rb_enc_get_index(obj) == idx)
620
690
if (SPECIAL_CONST_P(obj)) {
621
691
rb_raise(rb_eArgError, "cannot set encoding");
670
740
enc2 = rb_enc_from_index(idx2);
672
742
if (TYPE(str2) == T_STRING && RSTRING_LEN(str2) == 0)
743
return (idx1 == ENCINDEX_US_ASCII && rb_enc_asciicompat(enc2)) ? enc2 : enc1;
674
744
if (TYPE(str1) == T_STRING && RSTRING_LEN(str1) == 0)
745
return (idx2 == ENCINDEX_US_ASCII && rb_enc_asciicompat(enc1)) ? enc1 : enc2;
676
746
if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
811
rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
813
return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
741
817
rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
743
819
int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
757
833
return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
758
834
n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
760
return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(e-p));
836
return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(int)(e-p));
788
rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
864
rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
792
868
rb_raise(rb_eArgError, "empty string");
793
869
r = rb_enc_precise_mbclen(p, e, enc);
794
if (MBCLEN_CHARFOUND_P(r))
870
if (MBCLEN_CHARFOUND_P(r)) {
871
if (len_p) *len_p = MBCLEN_CHARFOUND_LEN(r);
795
872
return rb_enc_mbc_to_codepoint(p, e, enc);
797
875
rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(enc));
878
/* Drop any macro called rb_enc_codepoint so that the function of the
 * same name defined right below is not macro-expanded.
 * NOTE(review): presumably a header provides such a macro -- confirm. */
#undef rb_enc_codepoint
880
rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
882
return rb_enc_codepoint_len(p, e, 0, enc);
801
886
rb_enc_codelen(int c, rb_encoding *enc)
872
957
* Returns the list of names and aliases of the encoding.
874
* Encoding::WINDOWS_31J.names => ["Windows-31J", "CP932", "csWindows31J"]
959
* Encoding::WINDOWS_31J.names #=> ["Windows-31J", "CP932", "csWindows31J"]
877
962
enc_names(VALUE self)
889
* Encoding.list => [enc1, enc2, ...]
974
* Encoding.list -> [enc1, enc2, ...]
891
976
* Returns the list of loaded encodings.
894
* => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
895
* #<Encoding:ISO-2022-JP (dummy)>]
979
* #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
980
* #<Encoding:ISO-2022-JP (dummy)>]
897
982
* Encoding.find("US-ASCII")
898
* => #<Encoding:US-ASCII>
983
* #=> #<Encoding:US-ASCII>
901
* => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
902
* #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
986
* #=> [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
987
* #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
915
* Encoding.find(string) => enc
916
* Encoding.find(symbol) => enc
1000
* Encoding.find(string) -> enc
1001
* Encoding.find(symbol) -> enc
918
1003
* Searches for the encoding with the specified <i>name</i>.
919
1004
* <i>name</i> should be a string or symbol.
921
* Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
922
* Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
1006
* Encoding.find("US-ASCII") #=> #<Encoding:US-ASCII>
1007
* Encoding.find(:Shift_JIS) #=> #<Encoding:Shift_JIS>
924
1009
* Names which this method accepts are encoding names and aliases
925
1010
* including the following special aliases
945
* Encoding.compatible?(str1, str2) => enc or nil
1030
* Encoding.compatible?(str1, str2) -> enc or nil
947
1032
* Checks the compatibility of two strings.
948
* If they are compatible, means concatenatable,
949
* returns an encoding which the concatenated string will be.
1033
* If they are compatible, means concatenatable,
1034
* returns an encoding which the concatenated string will be.
950
1035
* If they are not compatible, nil is returned.
952
1037
* Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
953
* => #<Encoding:ISO-8859-1>
1038
* #=> #<Encoding:ISO-8859-1>
955
1040
* Encoding.compatible?(
956
1041
* "\xa1".force_encoding("iso-8859-1"),
957
1042
* "\xa1\xa1".force_encoding("euc-jp"))
1062
1147
char cp[sizeof(int) * 8 / 3 + 4];
1063
1148
snprintf(cp, sizeof cp, "CP%d", AreFileApisANSI() ? GetACP() : GetOEMCP());
1064
1149
idx = rb_enc_find_index(cp);
1065
#elif defined __APPLE__
1066
idx = rb_enc_to_index(rb_enc_find("UTF8-MAC"));
1150
if (idx < 0) idx = rb_ascii8bit_encindex();
1068
1152
idx = rb_enc_to_index(rb_default_external_encoding());
1071
if (rb_enc_registered("filesystem") < 0) enc_alias_internal("filesystem", idx);
1155
enc_alias_internal("filesystem", idx);
1160
rb_filesystem_encindex(void)
1162
int idx = rb_enc_registered("filesystem");
1164
idx = rb_ascii8bit_encindex();
1104
1199
enc_alias_internal(name, def->index);
1202
if (def == &default_external)
1203
enc_set_filesystem_encoding();
1107
1205
return overridden;
1110
/* File-scope record for the default external encoding; starts out
 * zero-initialized.
 * NOTE(review): the layout of struct default_encoding is not visible
 * in this view -- confirm field meanings against its declaration. */
static struct default_encoding default_external = {0};
1113
1209
rb_default_external_encoding(void)
1226
* Encoding.locale_charmap => string
1322
* Encoding.locale_charmap -> string
1228
1324
* Returns the locale charmap name.
1230
1326
* Debian GNU/Linux
1232
* Encoding.locale_charmap => "ANSI_X3.4-1968"
1328
* Encoding.locale_charmap #=> "ANSI_X3.4-1968"
1233
1329
* LANG=ja_JP.EUC-JP
1234
* Encoding.locale_charmap => "EUC-JP"
1330
* Encoding.locale_charmap #=> "EUC-JP"
1238
* Encoding.locale_charmap => "646"
1334
* Encoding.locale_charmap #=> "646"
1240
* Encoding.locale_charmap => "eucJP"
1336
* Encoding.locale_charmap #=> "eucJP"
1242
* The result is highly platform dependent.
1338
* The result is highly platform dependent.
1243
1339
* So Encoding.find(Encoding.locale_charmap) may cause an error.
1244
1340
* If you need some encoding object even for unknown locale,
1245
1341
* Encoding.find("locale") can be used.
1382
if (s - name > ENCODING_NAMELEN_MAX) return;
1287
1384
rb_define_const(rb_cEncoding, name, encoding);
1289
1386
if (!valid || haslower) {
1290
int len = strlen(name) + 1;
1387
size_t len = s - name;
1388
if (len > ENCODING_NAMELEN_MAX) return;
1291
1389
if (!haslower || !hasupper) {
1293
1391
if (ISLOWER(*s)) haslower = 1;
1294
1392
if (ISUPPER(*s)) hasupper = 1;
1295
1393
} while (*++s && (!haslower || !hasupper));
1397
if (len++ > ENCODING_NAMELEN_MAX) return;
1297
1398
MEMCPY(s = ALLOCA_N(char, len), name, char, len);
1329
* Encoding.name_list => ["enc1", "enc2", ...]
1430
* Encoding.name_list -> ["enc1", "enc2", ...]
1331
1432
* Returns the list of available encoding names.
1333
1434
* Encoding.name_list
1334
* => ["US-ASCII", "ASCII-8BIT", "UTF-8",
1335
* "ISO-8859-1", "Shift_JIS", "EUC-JP",
1337
* "BINARY", "CP932", "eucJP"]
1435
* #=> ["US-ASCII", "ASCII-8BIT", "UTF-8",
1436
* "ISO-8859-1", "Shift_JIS", "EUC-JP",
1438
* "BINARY", "CP932", "eucJP"]
1376
* Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
1477
* Encoding.aliases -> {"alias1" => "orig1", "alias2" => "orig2", ...}
1378
1479
* Returns the hash of available encoding aliases and their original encoding names.
1380
1481
* Encoding.aliases
1381
* => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
1382
* "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1482
* #=> {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
1483
* "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1404
1505
rb_cEncoding = rb_define_class("Encoding", rb_cObject);
1405
1506
rb_undef_alloc_func(rb_cEncoding);
1507
rb_undef_method(CLASS_OF(rb_cEncoding), "new");
1406
1508
rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
1407
1509
rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
1408
1510
rb_define_method(rb_cEncoding, "name", enc_name, 0);
1409
1511
rb_define_method(rb_cEncoding, "names", enc_names, 0);
1410
1512
rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
1513
rb_define_method(rb_cEncoding, "ascii_compatible?", enc_ascii_compatible_p, 0);
1514
rb_define_method(rb_cEncoding, "replicate", enc_replicate, 1);
1411
1515
rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
1412
1516
rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
1413
1517
rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);