1
# -*- coding: utf-8 -*-
5
class TestUnicodeEscape < Test::Unit::TestCase
7
# \u escapes in double-quoted strings, %Q and %W literals.
# Each assertion compares an escaped literal with the same text written
# either as literal characters or as \x byte escapes. Both the four-digit
# form (\uXXXX) and the braced form (\u{...}) are exercised, including
# several space-separated code points inside one pair of braces.
assert_equal('Matz - 松本行弘',
8
"Matz - \u677E\u672C\u884C\u5F18")
9
assert_equal('Matz - まつもと ゆきひろ',
10
"Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D")
11
assert_equal('Matz - まつもと ゆきひろ',
12
"Matz - \u{307E}\u{3064}\u{3082}\u{3068} \u{3086}\u{304D}\u{3072}\u{308D}")
13
assert_equal('Matz - まつもと ゆきひろ',
14
"Matz - \u{307E 3064 3082 3068 20 3086 304D 3072 308D}")
15
assert_equal("Aoyama Gakuin University - \xE9\x9D\x92\xE5\xB1\xB1\xE5\xAD\xA6\xE9\x99\xA2\xE5\xA4\xA7\xE5\xAD\xA6",
16
"Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66")
17
assert_equal('Aoyama Gakuin University - 青山学院大学',
18
"Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66")
19
assert_equal('青山学院大学', "\u9752\u5C71\u5B66\u9662\u5927\u5B66")
20
assert_equal("Martin D\xC3\xBCrst", "Martin D\u00FCrst")
21
assert_equal('Martin Dürst', "Martin D\u00FCrst")
22
assert_equal('ü', "\u00FC")
23
assert_equal("Martin D\xC3\xBCrst", "Martin D\u{FC}rst")
24
assert_equal('Martin Dürst', "Martin D\u{FC}rst")
25
assert_equal('ü', "\u{FC}")
26
assert_equal('ü', %Q|\u{FC}|)
27
assert_equal('ü', %W{\u{FC}}[0])
29
# \u escapes in here documents: the bare (<<EOS) and double-quoted
# (<<"EOS") forms are expected to expand them, the single-quoted
# (<<'EOS') form is expected to leave them unexpanded.
# NOTE(review): no EOS terminator lines are visible in this view of the
# file -- confirm they are present in the real source.
assert_equal('Matz - まつもと ゆきひろ', <<EOS.chop)
31
Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D
34
assert_equal('Matz - まつもと ゆきひろ', <<"EOS".chop)
35
Matz - \u{307E 3064 3082 3068} \u{3086 304D 3072 308D}
37
assert_not_equal('Matz - まつもと ゆきひろ', <<'EOS'.chop)
38
Matz - \u{307E 3064 3082 3068} \u{3086 304D 3072 308D}
41
# Single-quoted things ('...', %q, %w) don't expand \u.
# The last assertion builds backslash + "u00fc" by concatenation and
# expects it to equal the unexpanded single-quoted spelling.
42
assert_not_equal('ü', '\u{FC}')
43
assert_not_equal('ü', %q|\u{FC}|)
44
assert_not_equal('ü', %w{\u{FC}}[0])
45
assert_equal('\u00fc', "\\" + "u00fc")
48
# \u escapes inside backtick and %x command literals. These assertions
# shell out to echo and compare its output (trailing newline chopped in
# the first two) with the expected character. In the last one the output
# is tagged as UTF-8 with force_encoding before the comparison.
assert_equal(`echo "\u0041"`.chop, "A")
49
assert_equal(%x{echo "\u0041"}.chop, "A")
50
assert_equal(`echo "\u{FC}"`.force_encoding("utf-8"), "ü\n")
52
# \u in quoted symbols: a :"..." symbol written with \u escapes is
# expected to be the same symbol as its plain spelling.
53
assert_equal(:A, :"\u0041")
54
assert_equal(:a, :"\u0061")
57
assert_equal(:"\u{41}", :"\u0041")
58
assert_equal(:ü, :"\u{fc}")
60
# The NUL character is not allowed in symbols. Each literal below is
# handed to eval as source text and must raise SyntaxError, whether NUL is
# written as \u{0}, \u0000, inside a multi-codepoint \u{...}, \x00 or \0.
61
assert_raise(SyntaxError) { eval %q(:"\u{0}")}
62
assert_raise(SyntaxError) { eval %q(:"\u0000")}
63
assert_raise(SyntaxError) { eval %q(:"\u{fc 0 0041}")}
64
assert_raise(SyntaxError) { eval %q(:"\x00")}
65
assert_raise(SyntaxError) { eval %q(:"\0")}
70
# Compare regexps to regexps: a regexp written with \u escapes is expected
# NOT to compare equal to the one written with literal characters. The
# .source assertions later in this file show that the escape text is kept
# as written, so the two patterns differ textually.
71
assert_not_equal(/Yukihiro Matsumoto - 松本行弘/,
72
/Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/)
73
assert_not_equal(/Yukihiro Matsumoto - 松本行弘/,
74
/Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
75
assert_not_equal(/Matz - まつもと ゆきひろ/,
76
/Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/)
77
assert_not_equal(/Aoyama Gakuin University - 青山学院大学/,
78
/Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/)
79
assert_not_equal(/青山学院大学/, /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
80
assert_not_equal(/Martin Dürst/, /Martin D\u00FCrst/)
81
assert_not_equal(/ü/, /\u00FC/)
82
assert_not_equal(/Martin Dürst/, /Martin D\u{FC}rst/)
83
assert_not_equal(/ü/, /\u{FC}/)
84
assert_not_equal(/ü/, %r{\u{FC}})
85
assert_not_equal(/ü/i, %r{\u00FC}i)
87
# Regexp#source is expected to return the \u escapes exactly as written in
# the literal (not the expanded characters), for both /.../ and %r{...}
# forms and regardless of the i flag.
assert_equal('Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18',
88
/Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/.source)
89
assert_equal('Yukihiro Matsumoto - \u{677E 672C 884C 5F18}',
90
/Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/.source)
91
assert_equal('Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D',
92
/Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/.source)
93
assert_equal('Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66',
94
/Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/.source)
95
assert_equal('\u9752\u5C71\u5B66\u9662\u5927\u5B66',
96
/\u9752\u5C71\u5B66\u9662\u5927\u5B66/.source)
97
assert_equal('Martin D\u00FCrst', /Martin D\u00FCrst/.source)
98
assert_equal('\u00FC', /\u00FC/.source)
99
assert_equal('Martin D\u{FC}rst', /Martin D\u{FC}rst/.source)
100
assert_equal('\u{FC}', /\u{FC}/.source)
101
assert_equal('\u{FC}', %r{\u{FC}}.source)
102
assert_equal('\u00FC', %r{\u00FC}i.source)
104
# Match strings to regexps. The expected value is the index at which the
# match starts, so 0 means the pattern matches from the beginning.
# NOTE(review): the second assertion's pattern stops one code point short
# of the string (no \u5F18) and is still expected to match at 0; looks
# deliberate (prefix match) -- confirm.
105
assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18/)
106
assert_equal(0, "Yukihiro Matsumoto - \u677E\u672C\u884C\u5F18" =~ /Yukihiro Matsumoto - \u677E\u672C\u884C/)
107
assert_equal(0, "Yukihiro Matsumoto - 松本行弘" =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
108
assert_equal(0, %Q{Yukihiro Matsumoto - \u{677E 672C 884C 5F18}} =~ /Yukihiro Matsumoto - \u{677E 672C 884C 5F18}/)
109
assert_equal(0, "Matz - まつもと ゆきひろ" =~ /Matz - \u307E\u3064\u3082\u3068 \u3086\u304D\u3072\u308D/)
110
assert_equal(0, "Aoyama Gakuin University - 青山学院大学" =~ /Aoyama Gakuin University - \u9752\u5C71\u5B66\u9662\u5927\u5B66/)
111
assert_equal(0, "青山学院大学" =~ /\u9752\u5C71\u5B66\u9662\u5927\u5B66/)
112
assert_equal(0, "Martin Dürst" =~ /Martin D\u00FCrst/)
113
assert_equal(0, "ü" =~ /\u00FC/)
114
assert_equal(0, "Martin Dürst" =~ /Martin D\u{FC}rst/)
115
assert_equal(0, "ü" =~ %r{\u{FC}})
116
assert_equal(0, "ü" =~ %r{\u00FC}i)
118
# Flip order of the two operands
119
assert_equal(0, /Martin D\u00FCrst/ =~ "Martin Dürst")
120
assert_equal(4, /\u00FC/ =~ "testü")
121
assert_equal(3, /Martin D\u{FC}rst/ =~ "fooMartin Dürstbar")
122
assert_equal(3, %r{\u{FC}} =~ "fooübar")
124
# Put \u in strings, literal character in regexp
125
assert_equal(0, "Martin D\u00FCrst" =~ /Martin Dürst/)
126
assert_equal(4, "test\u00FC" =~ /ü/)
127
assert_equal(3, "fooMartin D\u{FC}rstbar" =~ /Martin Dürst/)
128
assert_equal(3, %Q{foo\u{FC}bar} =~ %r<ü>)
130
# Regexp literals compiled through eval: \u{2a} must match a literal "*".
# A \u escape combined with the n, e or s flag must raise SyntaxError,
# while the same pattern with the u flag must not raise.
assert_match(eval('/\u{2a}/'), "*")
131
assert_raise(SyntaxError) { eval('/\u{6666}/n') }
132
assert_raise(SyntaxError) { eval('/\u{6666}/e') }
133
assert_raise(SyntaxError) { eval('/\u{6666}/s') }
134
assert_nothing_raised { eval('/\u{6666}/u') }
137
# The pattern string contains the text \u{FC} (the backslash is escaped in
# the string literal), so the escape reaches Regexp.new unexpanded; the
# resulting regexp must still match the literal character.
def test_dynamic_regexp
138
assert_match(Regexp.new("Martin D\\u{FC}rst"), "Martin Dürst")
141
# Four-digit \u escapes written with upper-case, lower-case and mixed-case
# hex digits must all yield the same UTF-8 byte sequence.
# NOTE(review): the first two assertions are textually identical, and so
# are the third and fourth -- confirm whether distinct variants were
# intended.
def test_syntax_variants
143
assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89AB\uCDEF")
144
assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89AB\uCDEF")
145
assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89ab\ucdef")
146
assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89ab\ucdef")
147
assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89aB\uCdEf")
148
assert_equal("\xC4\xA3\xE4\x95\xA7\xE8\xA6\xAB\xEC\xB7\xAF", "\u0123\u4567\u89aB\ucDEF")
152
# Examples from Hal Fulton's book (second edition), chapter 4.
# Precomposed spelling first: U+00E9 is used for each accented letter, and
# the result is checked both against literal characters and against the
# \x-escaped UTF-8 bytes, with ASCII letters written either as \u escapes
# or as plain characters.
154
assert_equal('épée', "\u00E9\u0070\u00E9\u0065")
155
assert_equal('épée', "\u00E9p\u00E9e")
156
assert_equal("\xC3\xA9\x70\xC3\xA9\x65", "\u00E9\u0070\u00E9\u0065")
157
assert_equal("\xC3\xA9\x70\xC3\xA9\x65", "\u00E9p\u00E9e")
159
assert_equal('épée', "\u0065\u0301\u0070\u0065\u0301\u0065")
160
assert_equal('épée', "e\u0301pe\u0301e")
161
assert_equal("\x65\xCC\x81\x70\x65\xCC\x81\x65", "\u0065\u0301\u0070\u0065\u0301\u0065")
162
assert_equal("\x65\xCC\x81\x70\x65\xCC\x81\x65", "e\u0301pe\u0301e")
163
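# Combinations of NFC/NFD and NFKC/NFKD spellings of the same word:
# precomposed U+00F6 vs. "o" + combining U+0308, and plain "ff" vs. the
# U+FB00 ligature.
# NOTE(review): the single-quoted expected literals must be stored in the
# matching form for each assertion to hold; the forms look alike when
# rendered -- confirm the file's actual bytes.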
164
assert_equal('öffnen', "\u00F6\u0066\u0066\u006E\u0065\u006E")
165
assert_equal("\xC3\xB6ffnen", "\u00F6\u0066\u0066\u006E\u0065\u006E")
166
assert_equal('öffnen', "\u00F6ffnen")
167
assert_equal("\xC3\xB6ffnen", "\u00F6ffnen")
168
assert_equal('öffnen', "\u006F\u0308\u0066\u0066\u006E\u0065\u006E")
169
assert_equal("\x6F\xCC\x88ffnen", "\u006F\u0308\u0066\u0066\u006E\u0065\u006E")
170
assert_equal('öffnen', "o\u0308ffnen")
171
assert_equal("\x6F\xCC\x88ffnen", "o\u0308ffnen")
172
assert_equal('öffnen', "\u00F6\uFB00\u006E\u0065\u006E")
173
assert_equal("\xC3\xB6\xEF\xAC\x80nen", "\u00F6\uFB00\u006E\u0065\u006E")
174
assert_equal('öffnen', "\u00F6\uFB00nen")
175
assert_equal("\xC3\xB6\xEF\xAC\x80nen", "\u00F6\uFB00nen")
176
assert_equal('öffnen', "\u006F\u0308\uFB00\u006E\u0065\u006E")
177
assert_equal("\x6F\xCC\x88\xEF\xAC\x80nen", "\u006F\u0308\uFB00\u006E\u0065\u006E")
178
assert_equal('öffnen', "o\u0308\uFB00nen")
179
assert_equal("\x6F\xCC\x88\xEF\xAC\x80nen", "o\u0308\uFB00nen")
180
# German sharp s (sz): U+00DF written with \u00DF and \u{DF}, with the
# surrounding ASCII letters given as escapes, as plain characters, or all
# together inside one \u{...}; checked against literal characters and
# against the \x-escaped UTF-8 bytes.
181
assert_equal('Straße', "\u0053\u0074\u0072\u0061\u00DF\u0065")
182
assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "\u0053\u0074\u0072\u0061\u00DF\u0065")
183
assert_equal('Straße', "Stra\u00DFe")
184
assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "Stra\u00DFe")
185
assert_equal('Straße', "\u{53}\u{74}\u{72}\u{61}\u{DF}\u{65}")
186
assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "\u{53}\u{74}\u{72}\u{61}\u{DF}\u{65}")
187
assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "\u{53 74 72 61 DF 65}")
188
assert_equal('Straße', "Stra\u{DF}e")
189
assert_equal("\x53\x74\x72\x61\xC3\x9F\x65", "Stra\u{DF}e")
193
# Start and end of each outer plane (U+10000 through U+10FFFF), walked
# from the highest plane down; every one is expected to encode as four
# UTF-8 bytes. The trailing group checks code points in the basic plane
# where the expected byte count changes (U+0800/U+07FF, U+0080/U+007F),
# the code points on either side of the surrogate range (U+E000, U+D7FF),
# and the extremes U+FFFF and U+0000.
194
assert_equal("\xF4\x8F\xBF\xBF", "\u{10FFFF}")
195
assert_equal("\xF4\x80\x80\x80", "\u{100000}")
196
assert_equal("\xF3\xBF\xBF\xBF", "\u{FFFFF}")
197
assert_equal("\xF3\xB0\x80\x80", "\u{F0000}")
198
assert_equal("\xF3\xAF\xBF\xBF", "\u{EFFFF}")
199
assert_equal("\xF3\xA0\x80\x80", "\u{E0000}")
200
assert_equal("\xF3\x9F\xBF\xBF", "\u{DFFFF}")
201
assert_equal("\xF3\x90\x80\x80", "\u{D0000}")
202
assert_equal("\xF3\x8F\xBF\xBF", "\u{CFFFF}")
203
assert_equal("\xF3\x80\x80\x80", "\u{C0000}")
204
assert_equal("\xF2\xBF\xBF\xBF", "\u{BFFFF}")
205
assert_equal("\xF2\xB0\x80\x80", "\u{B0000}")
206
assert_equal("\xF2\xAF\xBF\xBF", "\u{AFFFF}")
207
assert_equal("\xF2\xA0\x80\x80", "\u{A0000}")
208
assert_equal("\xF2\x9F\xBF\xBF", "\u{9FFFF}")
209
assert_equal("\xF2\x90\x80\x80", "\u{90000}")
210
assert_equal("\xF2\x8F\xBF\xBF", "\u{8FFFF}")
211
assert_equal("\xF2\x80\x80\x80", "\u{80000}")
212
assert_equal("\xF1\xBF\xBF\xBF", "\u{7FFFF}")
213
assert_equal("\xF1\xB0\x80\x80", "\u{70000}")
214
assert_equal("\xF1\xAF\xBF\xBF", "\u{6FFFF}")
215
assert_equal("\xF1\xA0\x80\x80", "\u{60000}")
216
assert_equal("\xF1\x9F\xBF\xBF", "\u{5FFFF}")
217
assert_equal("\xF1\x90\x80\x80", "\u{50000}")
218
assert_equal("\xF1\x8F\xBF\xBF", "\u{4FFFF}")
219
assert_equal("\xF1\x80\x80\x80", "\u{40000}")
220
assert_equal("\xF0\xBF\xBF\xBF", "\u{3FFFF}")
221
assert_equal("\xF0\xB0\x80\x80", "\u{30000}")
222
assert_equal("\xF0\xAF\xBF\xBF", "\u{2FFFF}")
223
assert_equal("\xF0\xA0\x80\x80", "\u{20000}")
224
assert_equal("\xF0\x9F\xBF\xBF", "\u{1FFFF}")
225
assert_equal("\xF0\x90\x80\x80", "\u{10000}")
227
assert_equal("\xEF\xBF\xBF", "\uFFFF")
228
assert_equal("\xEE\x80\x80", "\uE000")
229
assert_equal("\xED\x9F\xBF", "\uD7FF")
230
assert_equal("\xE0\xA0\x80", "\u0800")
231
assert_equal("\xDF\xBF", "\u07FF")
232
assert_equal("\xC2\x80", "\u0080")
233
assert_equal("\x7F", "\u007F")
234
assert_equal("\x00", "\u0000")
238
# \u escapes in ?-character literals, compared with the same character
# written plainly, as \x hex, or as octal; U+0000 is covered in both the
# braced and the four-digit form.
assert_equal(?\u0041, ?A)
239
assert_equal(?\u{79}, ?\x79)
240
assert_equal(?\u{0}, ?\000)
241
assert_equal(?\u0000, ?\000)
244
# Tests to make sure that disallowed cases fail. Each malformed literal is
# passed to eval as source text and must raise SyntaxError.
# NOTE(review): the last "extra space" case shows only a single space
# between 123 and 456, which is the same separator the valid
# multi-codepoint escapes elsewhere in this file use; whitespace may have
# been collapsed in this copy -- confirm against the original source.
246
assert_raise(SyntaxError) { eval %q("\uabc") } # too short
247
assert_raise(SyntaxError) { eval %q("\uab") } # too short
248
assert_raise(SyntaxError) { eval %q("\ua") } # too short
249
assert_raise(SyntaxError) { eval %q("\u") } # too short
250
assert_raise(SyntaxError) { eval %q("\u{110000}") } # too high
251
assert_raise(SyntaxError) { eval %q("\u{abcdeff}") } # too long
252
assert_raise(SyntaxError) { eval %q("\ughij") } # bad hex digits
253
assert_raise(SyntaxError) { eval %q("\u{ghij}") } # bad hex digits
255
assert_raise(SyntaxError) { eval %q("\u{123 456 }")} # extra space
256
assert_raise(SyntaxError) { eval %q("\u{ 123 456}")} # extra space
257
assert_raise(SyntaxError) { eval %q("\u{123 456}")} # extra space
259
# The utf-8 encoding object currently does not object to codepoints
260
# in the surrogate blocks, so these do not raise an error.
261
# assert_raise(SyntaxError) { "\uD800" } # surrogate block
262
# assert_raise(SyntaxError) { "\uDCBA" } # surrogate block
263
# assert_raise(SyntaxError) { "\uDFFF" } # surrogate block
264
# assert_raise(SyntaxError) { "\uD847\uDD9A" } # surrogate pair