1
# XSD4R - Charset handling library.
2
# Copyright (C) 2001, 2003, 2005 NAKAMURA, Hiroshi <nahi@ruby-lang.org>.
4
# This program is copyrighted free software by NAKAMURA, Hiroshi. You can
5
# redistribute it and/or modify it under the same terms of Ruby's license;
6
# either the dual license version in 2003, or any later version.
13
# Initial internal encoding name, taken from the interpreter-wide $KCODE
# setting. Other statements in this file may reassign it to 'UTF8'.
@internal_encoding = $KCODE
15
# Root of the exception hierarchy declared in this file.
class XSDError < StandardError
end

# Common parent of the charset-related errors below.
class CharsetError < XSDError
end

# Used in this file for "Unknown charset: ..." failures.
class UnknownCharsetError < CharsetError
end

# Used in this file for "Converter not found: ..." failures.
class CharsetConversionError < CharsetError
end
25
# Registry of converters keyed by [from_name, to_name] pairs of encoding
# names. Each value responds to #call(str) and returns the converted string.
EncodingConvertMap = {}

# Pure-Ruby UTF-8 <-> ISO-8859-1 converters: decode to code points with one
# pack directive and re-encode with the other ('U' = UTF-8 character,
# 'C' = single byte).
utf8_to_latin1 = Proc.new { |str| str.unpack('U*').pack('C*') }
latin1_to_utf8 = Proc.new { |str| str.unpack('C*').pack('U*') }

# Register under both spellings of the Latin-1 name: 'X_ISO8859_1' (the key
# historically used here) and 'X_ISO_8859_1' (the key paired with the
# 'iso-8859-1' label elsewhere in this file). Without the second spelling a
# name obtained from a label lookup never finds a converter.
['X_ISO8859_1', 'X_ISO_8859_1'].each do |latin1_name|
  EncodingConvertMap[['UTF8', latin1_name]] = utf8_to_latin1
  EncodingConvertMap[[latin1_name, 'UTF8']] = latin1_to_utf8
end
32
# Iconv-backed converters between the UTF8, EUC and SJIS encoding names.
# IconvCharset is presumably provided by the required file -- confirm.
require 'xsd/iconvcharset'
33
# With these converters registered the internal encoding is set to 'UTF8'.
@internal_encoding = 'UTF8'
34
# Shift_JIS tag handed to safe_iconv: 'cp932' on the platforms matched by the
# regexp (mswin, bccwin, mingw, cygwin, emx).
# NOTE(review): the alternative branch of this ternary is not visible in this
# excerpt -- confirm against the full file.
sjtag = (/(mswin|bccwin|mingw|cygwin|emx)/ =~ RUBY_PLATFORM) ? 'cp932' :
36
# Judging by the map keys, safe_iconv takes (to, from, str): the entry keyed
# ['UTF8', 'EUC'] passes "euc-jp" first and "utf-8" second -- TODO confirm.
EncodingConvertMap[['UTF8', 'EUC' ]] =
37
Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) }
38
EncodingConvertMap[['EUC' , 'UTF8']] =
39
Proc.new { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) }
40
EncodingConvertMap[['EUC' , 'SJIS']] =
41
Proc.new { |str| IconvCharset.safe_iconv(sjtag, "euc-jp", str) }
42
EncodingConvertMap[['UTF8', 'SJIS']] =
43
Proc.new { |str| IconvCharset.safe_iconv(sjtag, "utf-8", str) }
44
EncodingConvertMap[['SJIS', 'UTF8']] =
45
Proc.new { |str| IconvCharset.safe_iconv("utf-8", sjtag, str) }
46
EncodingConvertMap[['SJIS', 'EUC' ]] =
47
Proc.new { |str| IconvCharset.safe_iconv("euc-jp", sjtag, str) }
51
# NKF-backed converters between the EUC and SJIS encoding names. They use the
# same map keys as the iconv-based EUC/SJIS entries, so whichever assignment
# runs last is the one that stays registered.
# NOTE(review): per the nkf option reference, -s / -e select Shift_JIS /
# EUC-JP output, -X affects half-width kana handling and -m0 turns MIME
# decoding off -- confirm.
EncodingConvertMap[['EUC' , 'SJIS']] =
52
Proc.new { |str| NKF.nkf('-sXm0', str) }
53
EncodingConvertMap[['SJIS', 'EUC' ]] =
54
Proc.new { |str| NKF.nkf('-eXm0', str) }
60
# Uconv-backed converters between UTF8 and EUC / SJIS. The map values are
# bound Method objects taken from Uconv, and the internal encoding is set to
# 'UTF8' alongside them.
@internal_encoding = 'UTF8'
61
EncodingConvertMap[['UTF8', 'EUC' ]] = Uconv.method(:u8toeuc)
62
EncodingConvertMap[['UTF8', 'SJIS']] = Uconv.method(:u8tosjis)
63
EncodingConvertMap[['EUC' , 'UTF8']] = Uconv.method(:euctou8)
64
EncodingConvertMap[['SJIS', 'UTF8']] = Uconv.method(:sjistou8)
74
# Entries mapping an internal encoding name to its charset label.
'SJIS' => 'shift_jis',
76
# NOTE(review): this key is spelled 'X_ISO_8859_1', while the Latin-1
# converters in EncodingConvertMap are registered under 'X_ISO8859_1' (no
# underscore after ISO) -- keep the spellings in sync.
'X_ISO_8859_1' => 'iso-8859-1',
88
# Sets the internal encoding name used by the conversion helpers.
# Emits a warning with the new value when $DEBUG is set.
def Charset.encoding=(encoding)
89
warn("xsd charset is set to #{encoding}") if $DEBUG
90
@internal_encoding = encoding
93
# Looks up the charset label for the current internal encoding via
# charset_label.
def Charset.xml_encoding_label
94
charset_label(@internal_encoding)
97
# Converts str from the internal encoding to the encoding named by the
# charset label `charset` (resolved through charset_str).
def Charset.encoding_to_xml(str, charset)
98
encoding_conv(str, @internal_encoding, charset_str(charset))
101
# Converts str from the encoding named by the charset label `charset`
# (resolved through charset_str) to the internal encoding.
def Charset.encoding_from_xml(str, charset)
102
encoding_conv(str, charset_str(charset), @internal_encoding)
105
# Converts str between two encoding names.
# * When both names are equal, or either one is 'NONE', no converter is
#   looked up (NOTE(review): that branch's body is not visible in this
#   excerpt; presumably str is returned unchanged -- confirm).
# * Otherwise a converter is looked up in EncodingConvertMap under the key
#   [enc_from, enc_to].
# * A CharsetConversionError ("Converter not found: ...") is raised,
#   presumably when no converter is registered for the pair.
def Charset.encoding_conv(str, enc_from, enc_to)
106
if enc_from == enc_to or enc_from == 'NONE' or enc_to == 'NONE'
108
elsif converter = EncodingConvertMap[[enc_from, enc_to]]
111
raise CharsetConversionError.new(
112
"Converter not found: #{enc_from} -> #{enc_to}")
116
# Looks up the CharsetMap value stored for an encoding name; the name is
# upcased before the lookup.
def Charset.charset_label(encoding)
117
CharsetMap[encoding.upcase]
120
# Reverse lookup: finds the CharsetMap key whose value equals the downcased
# label, falling back to 'X_UNKNOWN' when nothing matches.
# Hash#key is used when the hash responds to it; otherwise Hash#index.
def Charset.charset_str(label)
121
if CharsetMap.respond_to?(:key)
122
CharsetMap.key(label.downcase) || 'X_UNKNOWN'
124
CharsetMap.index(label.downcase) || 'X_UNKNOWN'
128
# Character class for the single-byte characters the validators accept:
# tab, LF, CR and 0x20-0x7F. The unrestricted 7-bit range would be:
# us_ascii = '[\x00-\x7F]'
us_ascii = '[\x9\xa\xd\x20-\x7F]' # XML 1.0 restricted.

# Matches a whole string made up only of those characters.
# Current Ruby rejects a third (kcode) argument to Regexp.new, so the
# equivalent Regexp::NOENCODING option is used whenever it is defined; the
# three-argument form is kept only for interpreters without that constant.
us_ascii_source = "\\A#{us_ascii}*\\z"
USASCIIRegexp =
  if defined?(Regexp::NOENCODING)
    Regexp.new(us_ascii_source, Regexp::NOENCODING)
  else
    Regexp.new(us_ascii_source, nil, "NONE")
  end
132
# EUC byte patterns.
# Two bytes: a lead byte of 0x8E or 0xA1-0xFE followed by 0xA1-0xFE.
twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'
# Three bytes: 0x8F followed by two bytes in 0xA1-0xFE.
threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'
character_euc = "(?:#{us_ascii}|#{twobytes_euc}|#{threebytes_euc})"

# Matches a whole string consisting only of the sequences above.
# Current Ruby rejects a third (kcode) argument to Regexp.new, so the
# equivalent Regexp::NOENCODING option is used whenever it is defined; the
# three-argument form is kept only for interpreters without that constant.
euc_source = "\\A#{character_euc}*\\z"
EUCRegexp =
  if defined?(Regexp::NOENCODING)
    Regexp.new(euc_source, Regexp::NOENCODING)
  else
    Regexp.new(euc_source, nil, "NONE")
  end
137
# Shift_JIS byte patterns.
# One byte: tab, LF, CR, 0x20-0x7F, or 0xA1-0xDF. The unrestricted form
# would be:
# onebyte_sjis = '[\x00-\x7F\xA1-\xDF]'
onebyte_sjis = '[\x9\xa\xd\x20-\x7F\xA1-\xDF]' # XML 1.0 restricted.
# Two bytes: a lead byte in 0x81-0x9F or 0xE0-0xFC followed by a trail byte
# in 0x40-0x7E or 0x80-0xFC.
twobytes_sjis = '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])'
character_sjis = "(?:#{onebyte_sjis}|#{twobytes_sjis})"

# Matches a whole string consisting only of the sequences above.
# Current Ruby rejects a third (kcode) argument to Regexp.new, so the
# equivalent Regexp::NOENCODING option is used whenever it is defined; the
# three-argument form is kept only for interpreters without that constant.
sjis_source = "\\A#{character_sjis}*\\z"
SJISRegexp =
  if defined?(Regexp::NOENCODING)
    Regexp.new(sjis_source, Regexp::NOENCODING)
  else
    Regexp.new(sjis_source, nil, "NONE")
  end
145
# UTF-8 byte patterns.
# Two bytes: a lead byte in 0xC0-0xDF followed by one byte in 0x80-0xBF.
twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])'
146
# 1110zzzz 10yyyyyy 10xxxxxx
147
threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])'
148
# 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx
149
fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
151
# NOTE(review): no assignment to character_utf8 is visible in this excerpt;
# the alternation string below is presumably its value -- confirm.
"(?:#{us_ascii}|#{twobytes_utf8}|#{threebytes_utf8}|#{fourbytes_utf8})"
152
# Matches a whole string consisting only of the sequences above.
# NOTE(review): the third (kcode) argument to Regexp.new is reportedly
# rejected by Ruby 3.3 and later; Regexp::NOENCODING as the second argument
# looks like the replacement -- confirm against the Regexp.new documentation.
UTF8Regexp = Regexp.new("\\A#{character_utf8}*\\z", nil, "NONE")
154
# Per-encoding checks on str.
# NOTE(review): the method bodies are not visible in this excerpt; each one
# presumably matches str against the regexp constant of the same encoding
# (USASCIIRegexp, UTF8Regexp, EUCRegexp, SJISRegexp) -- confirm.
def Charset.is_us_ascii(str)
158
def Charset.is_utf8(str)
162
def Charset.is_euc(str)
166
def Charset.is_sjis(str)
170
# Checks str against the encoding named by code, which defaults to the
# interpreter-wide $KCODE value.
# Raises UnknownCharsetError ("Unknown charset: ...").
# NOTE(review): the dispatch on code is not visible in this excerpt; the raise
# is presumably the fallback for unrecognised names -- confirm.
def Charset.is_ces(str, code = $KCODE)
181
raise UnknownCharsetError.new("Unknown charset: #{code}")