1
""" Test script for the Unicode implementation.
3
Written by Marc-Andre Lemburg (mal@lemburg.com).
5
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
15
from test import support, string_tests
17
# Error handling (bad decoder return)
18
def search_function(encoding):
    """Codec search hook that hands out deliberately broken codecs.

    "test.unicode1" yields an encoder/decoder pair returning a bare integer
    instead of a tuple; "test.unicode2" yields a pair returning a tuple whose
    first item is not a string.  Every other name yields None.
    """
    def encode1(input, errors="strict"):
        return 42 # not a tuple

    def decode1(input, errors="strict"):
        return 42 # not a tuple

    def encode2(input, errors="strict"):
        return (42, 42) # no unicode

    def decode2(input, errors="strict"):
        return (42, 42) # no unicode

    # Guard clauses: hand back the matching 4-tuple, otherwise fall through.
    if encoding == "test.unicode1":
        return (encode1, decode1, None, None)
    if encoding == "test.unicode2":
        return (encode2, decode2, None, None)
    return None
33
codecs.register(search_function)
35
def duplicate_string(text):
    """
    Try to get a fresh clone of the specified text:
    new object with a reference count of 1.

    This is a best-effort: latin1 single letters and the empty
    string ('') are singletons and cannot be cloned.

    The text is round-tripped through UTF-8 with the 'surrogatepass'
    error handler, so strings containing lone surrogates are cloned too
    instead of raising UnicodeEncodeError.
    """
    encoded = text.encode('utf-8', 'surrogatepass')
    return encoded.decode('utf-8', 'surrogatepass')
45
class UnicodeTest(string_tests.CommonTest,
46
string_tests.MixinStrUnicodeUserStringTest,
47
string_tests.MixinStrUnicodeTest,
52
def checkequalnofix(self, result, object, methodname, *args):
53
method = getattr(object, methodname)
54
realresult = method(*args)
55
self.assertEqual(realresult, result)
56
self.assertTrue(type(realresult) is type(result))
58
# if the original is returned make sure that
59
# this doesn't happen with subclasses
60
if realresult is object:
63
return 'usub(%r)' % str.__repr__(self)
65
method = getattr(object, methodname)
66
realresult = method(*args)
67
self.assertEqual(realresult, result)
68
self.assertTrue(object is not realresult)
70
def test_literals(self):
71
self.assertEqual('\xff', '\u00ff')
72
self.assertEqual('\uffff', '\U0000ffff')
73
self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
74
self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
75
self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
76
# raw strings should not have unicode escapes
77
self.assertNotEqual(r"\u0020", " ")
80
if not sys.platform.startswith('java'):
81
# Test basic sanity of repr()
82
self.assertEqual(ascii('abc'), "'abc'")
83
self.assertEqual(ascii('ab\\c'), "'ab\\\\c'")
84
self.assertEqual(ascii('ab\\'), "'ab\\\\'")
85
self.assertEqual(ascii('\\c'), "'\\\\c'")
86
self.assertEqual(ascii('\\'), "'\\\\'")
87
self.assertEqual(ascii('\n'), "'\\n'")
88
self.assertEqual(ascii('\r'), "'\\r'")
89
self.assertEqual(ascii('\t'), "'\\t'")
90
self.assertEqual(ascii('\b'), "'\\x08'")
91
self.assertEqual(ascii("'\""), """'\\'"'""")
92
self.assertEqual(ascii("'\""), """'\\'"'""")
93
self.assertEqual(ascii("'"), '''"'"''')
94
self.assertEqual(ascii('"'), """'"'""")
96
"'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
97
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
98
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
99
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
100
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
101
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
102
"\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
103
"\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
104
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
105
"\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
106
"\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
107
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
108
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
110
testrepr = ascii(''.join(map(chr, range(256))))
111
self.assertEqual(testrepr, latin1repr)
112
# Test ascii works on wide unicode escapes without overflow.
113
self.assertEqual(ascii("\U00010000" * 39 + "\uffff" * 4096),
114
ascii("\U00010000" * 39 + "\uffff" * 4096))
119
self.assertRaises(TypeError, ascii, WrongRepr())
122
if not sys.platform.startswith('java'):
123
# Test basic sanity of repr()
124
self.assertEqual(repr('abc'), "'abc'")
125
self.assertEqual(repr('ab\\c'), "'ab\\\\c'")
126
self.assertEqual(repr('ab\\'), "'ab\\\\'")
127
self.assertEqual(repr('\\c'), "'\\\\c'")
128
self.assertEqual(repr('\\'), "'\\\\'")
129
self.assertEqual(repr('\n'), "'\\n'")
130
self.assertEqual(repr('\r'), "'\\r'")
131
self.assertEqual(repr('\t'), "'\\t'")
132
self.assertEqual(repr('\b'), "'\\x08'")
133
self.assertEqual(repr("'\""), """'\\'"'""")
134
self.assertEqual(repr("'\""), """'\\'"'""")
135
self.assertEqual(repr("'"), '''"'"''')
136
self.assertEqual(repr('"'), """'"'""")
138
"'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
139
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
140
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
141
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
142
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
143
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
144
"\\x9c\\x9d\\x9e\\x9f\\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9"
145
"\xaa\xab\xac\\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
146
"\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5"
147
"\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3"
148
"\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1"
149
"\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
150
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd"
152
testrepr = repr(''.join(map(chr, range(256))))
153
self.assertEqual(testrepr, latin1repr)
154
# Test repr works on wide unicode escapes without overflow.
155
self.assertEqual(repr("\U00010000" * 39 + "\uffff" * 4096),
156
repr("\U00010000" * 39 + "\uffff" * 4096))
161
self.assertRaises(TypeError, repr, WrongRepr())
163
def test_iterators(self):
164
# Make sure unicode objects have an __iter__ method
165
it = "\u1111\u2222\u3333".__iter__()
166
self.assertEqual(next(it), "\u1111")
167
self.assertEqual(next(it), "\u2222")
168
self.assertEqual(next(it), "\u3333")
169
self.assertRaises(StopIteration, next, it)
171
def test_count(self):
    # Generic count() checks first, then the str-specific ones.
    string_tests.CommonTest.test_count(self)
    # check mixed argument types
    # Rows are (expected count, arguments given to 'aaa'.count); repeated
    # rows keep the original sequence of checks intact.
    expectations = (
        (3, ('a',)),
        (0, ('b',)),
        (3, ('a',)),
        (0, ('b',)),
        (0, ('b',)),
        (1, ('a', -1)),
        (3, ('a', -10)),
        (2, ('a', 0, -1)),
        (0, ('a', 0, -10)),
    )
    for expected, call_args in expectations:
        self.checkequalnofix(expected, 'aaa', 'count', *call_args)
185
string_tests.CommonTest.test_find(self)
186
# test implementation details of the memchr fast path
187
self.checkequal(100, 'a' * 100 + '\u0102', 'find', '\u0102')
188
self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0201')
189
self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0120')
190
self.checkequal(-1, 'a' * 100 + '\u0102', 'find', '\u0220')
191
self.checkequal(100, 'a' * 100 + '\U00100304', 'find', '\U00100304')
192
self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00100204')
193
self.checkequal(-1, 'a' * 100 + '\U00100304', 'find', '\U00102004')
194
# check mixed argument types
195
self.checkequalnofix(0, 'abcdefghiabc', 'find', 'abc')
196
self.checkequalnofix(9, 'abcdefghiabc', 'find', 'abc', 1)
197
self.checkequalnofix(-1, 'abcdefghiabc', 'find', 'def', 4)
199
self.assertRaises(TypeError, 'hello'.find)
200
self.assertRaises(TypeError, 'hello'.find, 42)
202
def test_rfind(self):
    string_tests.CommonTest.test_rfind(self)
    filler = 'a' * 100
    # test implementation details of the memrchr fast path
    bmp_subject = '\u0102' + filler
    self.checkequal(0, bmp_subject, 'rfind', '\u0102')
    for absent in ('\u0201', '\u0120', '\u0220'):
        self.checkequal(-1, bmp_subject, 'rfind', absent)
    astral_subject = '\U00100304' + filler
    self.checkequal(0, astral_subject, 'rfind', '\U00100304')
    for absent in ('\U00100204', '\U00102004'):
        self.checkequal(-1, astral_subject, 'rfind', absent)
    # check mixed argument types
    haystack = 'abcdefghiabc'
    self.checkequalnofix(9, haystack, 'rfind', 'abc')
    # The empty-needle lookup is performed twice, as in the original.
    self.checkequalnofix(12, haystack, 'rfind', '')
    self.checkequalnofix(12, haystack, 'rfind', '')
217
def test_index(self):
    string_tests.CommonTest.test_index(self)
    haystack = 'abcdefghiabc'
    # Rows are (expected position, arguments given to index()).
    for position, call_args in (
        (0, ('',)),
        (3, ('def',)),
        (0, ('abc',)),
        (9, ('abc', 1)),
    ):
        self.checkequalnofix(position, haystack, 'index', *call_args)
    # Rows are (subject, arguments); each lookup must raise ValueError.
    for subject, call_args in (
        ('abcdefghiabc', ('hib',)),
        ('abcdefghiab', ('abc', 1)),
        ('abcdefghi', ('ghi', 8)),
        ('abcdefghi', ('ghi', -1)),
    ):
        self.assertRaises(ValueError, subject.index, *call_args)
228
def test_rindex(self):
    string_tests.CommonTest.test_rindex(self)
    haystack = 'abcdefghiabc'
    # Rows are (expected position, arguments given to rindex()).
    for position, call_args in (
        (12, ('',)),
        (3, ('def',)),
        (9, ('abc',)),
        (0, ('abc', 0, -1)),
    ):
        self.checkequalnofix(position, haystack, 'rindex', *call_args)

    # Rows are (subject, arguments); each lookup must raise ValueError.
    for subject, call_args in (
        ('abcdefghiabc', ('hib',)),
        ('defghiabc', ('def', 1)),
        ('defghiabc', ('abc', 0, -1)),
        ('abcdefghi', ('ghi', 0, 8)),
        ('abcdefghi', ('ghi', 0, -1)),
    ):
        self.assertRaises(ValueError, subject.rindex, *call_args)
241
def test_maketrans_translate(self):
242
# these work with plain translate()
243
self.checkequalnofix('bbbc', 'abababc', 'translate',
245
self.checkequalnofix('iiic', 'abababc', 'translate',
246
{ord('a'): None, ord('b'): ord('i')})
247
self.checkequalnofix('iiix', 'abababc', 'translate',
248
{ord('a'): None, ord('b'): ord('i'), ord('c'): 'x'})
249
self.checkequalnofix('c', 'abababc', 'translate',
250
{ord('a'): None, ord('b'): ''})
251
self.checkequalnofix('xyyx', 'xzx', 'translate',
253
# this needs maketrans()
254
self.checkequalnofix('abababc', 'abababc', 'translate',
256
tbl = self.type2test.maketrans({'a': None, 'b': '<i>'})
257
self.checkequalnofix('<i><i><i>c', 'abababc', 'translate', tbl)
258
# test alternative way of calling maketrans()
259
tbl = self.type2test.maketrans('abc', 'xyz', 'd')
260
self.checkequalnofix('xyzzy', 'abdcdcbdddd', 'translate', tbl)
262
self.assertRaises(TypeError, self.type2test.maketrans)
263
self.assertRaises(ValueError, self.type2test.maketrans, 'abc', 'defg')
264
self.assertRaises(TypeError, self.type2test.maketrans, 2, 'def')
265
self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 2)
266
self.assertRaises(TypeError, self.type2test.maketrans, 'abc', 'def', 2)
267
self.assertRaises(ValueError, self.type2test.maketrans, {'xy': 2})
268
self.assertRaises(TypeError, self.type2test.maketrans, {(1,): 2})
270
self.assertRaises(TypeError, 'hello'.translate)
271
self.assertRaises(TypeError, 'abababc'.translate, 'abc', 'xyz')
273
def test_split(self):
    string_tests.CommonTest.test_split(self)
    slashed = 'a//b//c//d'
    # The multi-character separator split is checked twice, as in the
    # original; a fresh expected list is built for each call.
    self.checkequalnofix(list('abcd'), slashed, 'split', '//')
    self.checkequalnofix(list('abcd'), slashed, 'split', '//')
    # A separator at the very end leaves a trailing empty string.
    self.checkequalnofix(['endcase ', ''], 'endcase test', 'split', 'test')
282
string_tests.MixinStrUnicodeUserStringTest.test_join(self)
285
def __init__(self, sval): self.sval = sval
286
def __str__(self): return self.sval
289
self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
290
self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
291
self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
292
self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
293
self.checkequalnofix('a b c d', ' ', 'join', ['a', 'b', 'c', 'd'])
294
self.checkequalnofix('abcd', '', 'join', ('a', 'b', 'c', 'd'))
295
self.checkequalnofix('w x y z', ' ', 'join', string_tests.Sequence('wxyz'))
296
self.checkraises(TypeError, ' ', 'join', ['1', '2', MyWrapper('foo')])
297
self.checkraises(TypeError, ' ', 'join', ['1', '2', '3', bytes()])
298
self.checkraises(TypeError, ' ', 'join', [1, 2, 3])
299
self.checkraises(TypeError, ' ', 'join', ['1', '2', 3])
301
def test_replace(self):
    string_tests.CommonTest.test_replace(self)

    # method call forwarded from str implementation because of unicode argument
    source_text = 'one!two!three!'
    replaced_once = 'one@two!three!'
    self.checkequalnofix(replaced_once, source_text, 'replace', '!', '@', 1)
    # A non-string replacement argument must raise TypeError.
    with self.assertRaises(TypeError):
        'replace'.replace("r", 42)
308
@support.cpython_only
309
def test_replace_id(self):
312
self.assertIs(text.replace(pattern, pattern), text)
314
def test_bytes_comparison(self):
    # 'abc' must compare unequal to both bytes and bytearray operands;
    # BytesWarning is ignored while the comparisons run.
    with support.check_warnings():
        warnings.simplefilter('ignore', BytesWarning)
        for binary in (b'abc', bytearray(b'abc')):
            self.assertEqual('abc' == binary, False)
            self.assertEqual('abc' != binary, True)
322
def test_comparison(self):
324
self.assertEqual('abc', 'abc')
325
self.assertTrue('abcd' > 'abc')
326
self.assertTrue('abc' < 'abcd')
329
# Move these tests to a Unicode collation module test...
330
# Testing UTF-16 code point order comparisons...
332
# No surrogates, no fixup required.
333
self.assertTrue('\u0061' < '\u20ac')
334
# Non surrogate below surrogate value, no fixup required
335
self.assertTrue('\u0061' < '\ud800\udc02')
337
# Non surrogate above surrogate value, fixup required
338
def test_lecmp(s, s2):
339
self.assertTrue(s < s2)
378
# Surrogates on both sides, no fixup required
379
self.assertTrue('\ud800\udc02' < '\ud84d\udc56')
381
def test_islower(self):
    string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
    self.checkequalnofix(False, '\u1FFc', 'islower')
    # Rows are (character, expected islower() result), in check order;
    # the \U escapes are characters outside the BMP.
    samples = (
        ('\u2167', False),
        ('\u2177', True),
        ('\U00010401', False),
        ('\U00010427', False),
        ('\U00010429', True),
        ('\U0001044E', True),
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, is_lower in samples:
        verify = self.assertTrue if is_lower else self.assertFalse
        verify(char.islower())
396
def test_isupper(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
    running_on_java = sys.platform.startswith('java')
    # NOTE(review): only the check below is assumed to sit under the
    # platform guard; the indentation of this block was lost -- confirm.
    if not running_on_java:
        self.checkequalnofix(False, '\u1FFc', 'isupper')
    # Rows are (character, expected isupper() result), in check order;
    # the \U escapes are characters outside the BMP.
    samples = (
        ('\u2167', True),
        ('\u2177', False),
        ('\U00010401', True),
        ('\U00010427', True),
        ('\U00010429', False),
        ('\U0001044E', False),
        ('\U0001F40D', False),
        ('\U0001F46F', False),
    )
    for char, is_upper in samples:
        verify = self.assertTrue if is_upper else self.assertFalse
        verify(char.isupper())
412
def test_istitle(self):
    string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
    for titled in ('\u1FFc', 'Greek \u1FFcitlecases ...'):
        self.checkequalnofix(True, titled, 'istitle')

    # non-BMP, uppercase + lowercase
    for pair in ('\U00010401\U00010429', '\U00010427\U0001044E'):
        self.assertTrue(pair.istitle())
    # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
    untitled = ('\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F')
    for char in untitled:
        message = '{!a} is not title'.format(char)
        self.assertFalse(char.istitle(), message)
424
def test_isspace(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
    # Rows are (expected isspace() result, character).
    for expected, char in (
        (True, '\u2000'),
        (True, '\u200a'),
        (False, '\u2014'),
    ):
        self.checkequalnofix(expected, char, 'isspace')
    # apparently there are no non-BMP space characters in Unicode 6
    non_spaces = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001F40D', '\U0001F46F',
    )
    for char in non_spaces:
        message = '{!a} is not space.'.format(char)
        self.assertFalse(char.isspace(), message)
434
def test_isalnum(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
    # Every one of these non-BMP characters must report isalnum() true.
    alnum_chars = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107',
    )
    for char in alnum_chars:
        message = '{!a} is alnum.'.format(char)
        self.assertTrue(char.isalnum(), message)
440
def test_isalpha(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
    self.checkequalnofix(True, '\u1FFc', 'isalpha')
    # Non-BMP characters expected to be alphabetic.
    for char in ('\U00010401', '\U00010427', '\U00010429', '\U0001044E'):
        self.assertTrue(char.isalpha())
    # Non-BMP characters expected not to be alphabetic.
    for char in ('\U0001F40D', '\U0001F46F'):
        self.assertFalse(char.isalpha())
452
def test_isdecimal(self):
    # Rows are (expected isdecimal() result, text), checked in order.
    basic_cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (False, '\u2460'),  # CIRCLED DIGIT ONE
        (False, '\xbc'),  # VULGAR FRACTION ONE QUARTER
        (True, '\u0660'),  # ARABIC-INDIC DIGIT ZERO
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in basic_cases:
        self.checkequalnofix(expected, text, 'isdecimal')

    # Passing an argument to isdecimal() must raise TypeError.
    self.checkraises(TypeError, 'abc', 'isdecimal', 42)

    non_decimals = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107',
    )
    for char in non_decimals:
        message = '{!a} is not decimal.'.format(char)
        self.assertFalse(char.isdecimal(), message)
    decimals = ('\U0001D7F6', '\U00011066', '\U000104A0')
    for char in decimals:
        message = '{!a} is decimal.'.format(char)
        self.assertTrue(char.isdecimal(), message)
470
def test_isdigit(self):
    string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
    # Rows are (expected isdigit() result, character).
    for expected, char in (
        (True, '\u2460'),
        (False, '\xbc'),
        (True, '\u0660'),
    ):
        self.checkequalnofix(expected, char, 'isdigit')

    non_digits = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001F40D', '\U0001F46F', '\U00011065',
    )
    for char in non_digits:
        message = '{!a} is not a digit.'.format(char)
        self.assertFalse(char.isdigit(), message)
    digits = ('\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107')
    for char in digits:
        message = '{!a} is a digit.'.format(char)
        self.assertTrue(char.isdigit(), message)
482
def test_isnumeric(self):
    # Rows are (expected isnumeric() result, text), checked in order.
    basic_cases = (
        (False, ''),
        (False, 'a'),
        (True, '0'),
        (True, '\u2460'),
        (True, '\xbc'),
        (True, '\u0660'),
        (True, '0123456789'),
        (False, '0123456789a'),
    )
    for expected, text in basic_cases:
        self.checkequalnofix(expected, text, 'isnumeric')

    # Passing an argument to isnumeric() must raise TypeError.
    self.assertRaises(TypeError, "abc".isnumeric, 42)

    non_numerics = (
        '\U00010401', '\U00010427', '\U00010429', '\U0001044E',
        '\U0001F40D', '\U0001F46F',
    )
    for char in non_numerics:
        message = '{!a} is not numeric.'.format(char)
        self.assertFalse(char.isnumeric(), message)
    numerics = (
        '\U00011065', '\U0001D7F6', '\U00011066',
        '\U000104A0', '\U0001F107',
    )
    for char in numerics:
        message = '{!a} is numeric.'.format(char)
        self.assertTrue(char.isnumeric(), message)
501
def test_isidentifier(self):
502
self.assertTrue("a".isidentifier())
503
self.assertTrue("Z".isidentifier())
504
self.assertTrue("_".isidentifier())
505
self.assertTrue("b0".isidentifier())
506
self.assertTrue("bc".isidentifier())
507
self.assertTrue("b_".isidentifier())
508
self.assertTrue("Āµ".isidentifier())
509
self.assertTrue("šš«š¦š š¬š”š¢".isidentifier())
511
self.assertFalse(" ".isidentifier())
512
self.assertFalse("[".isidentifier())
513
self.assertFalse("Ā©".isidentifier())
514
self.assertFalse("0".isidentifier())
516
def test_isprintable(self):
517
self.assertTrue("".isprintable())
518
self.assertTrue(" ".isprintable())
519
self.assertTrue("abcdefg".isprintable())
520
self.assertFalse("abcdefg\n".isprintable())
521
# some defined Unicode character
522
self.assertTrue("\u0374".isprintable())
523
# undefined character
524
self.assertFalse("\u0378".isprintable())
525
# single surrogate character
526
self.assertFalse("\ud800".isprintable())
528
self.assertTrue('\U0001F46F'.isprintable())
529
self.assertFalse('\U000E0020'.isprintable())
531
def test_surrogates(self):
532
for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
533
'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
534
self.assertTrue(s.islower())
535
self.assertFalse(s.isupper())
536
self.assertFalse(s.istitle())
537
for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
538
'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
539
self.assertFalse(s.islower())
540
self.assertTrue(s.isupper())
541
self.assertTrue(s.istitle())
543
for meth_name in ('islower', 'isupper', 'istitle'):
544
meth = getattr(str, meth_name)
545
for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
546
self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
548
for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
549
'isdecimal', 'isnumeric',
550
'isidentifier', 'isprintable'):
551
meth = getattr(str, meth_name)
552
for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
553
'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
554
'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
555
self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
558
def test_lower(self):
    # str.lower() beyond the generic checks: non-BMP letters, characters
    # whose lowercase form has a different length, and the Greek sigma
    # special case.
    string_tests.CommonTest.test_lower(self)
    self.assertEqual('\U00010427'.lower(), '\U0001044F')
    self.assertEqual('\U00010427\U00010427'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.lower(),
                     '\U0001044F\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.lower(),
                     'x\U0001044Fx\U0001044F')
    # U+FB01 LATIN SMALL LIGATURE FI, written as an escape because the
    # literal had been corrupted by a wrong source decoding.
    self.assertEqual('\ufb01'.lower(), '\ufb01')
    self.assertEqual('\u0130'.lower(), '\u0069\u0307')
    # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
    self.assertEqual('\u03a3'.lower(), '\u03c3')
    self.assertEqual('\u0345\u03a3'.lower(), '\u0345\u03c3')
    self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
    self.assertEqual('A\u0345\u03a3a'.lower(), 'a\u0345\u03c3a')
    self.assertEqual('A\u0345\u03a3'.lower(), 'a\u0345\u03c2')
    self.assertEqual('A\u03a3\u0345'.lower(), 'a\u03c2\u0345')
    self.assertEqual('\u03a3\u0345 '.lower(), '\u03c3\u0345 ')
    self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
    self.assertEqual('\u2177'.lower(), '\u2177')
580
def test_casefold(self):
581
self.assertEqual('hello'.casefold(), 'hello')
582
self.assertEqual('hELlo'.casefold(), 'hello')
583
self.assertEqual('Ć'.casefold(), 'ss')
584
self.assertEqual('ļ¬'.casefold(), 'fi')
585
self.assertEqual('\u03a3'.casefold(), '\u03c3')
586
self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
587
self.assertEqual('\u00b5'.casefold(), '\u03bc')
589
def test_upper(self):
    # str.upper() beyond the generic checks: non-BMP letters and
    # characters whose uppercase form is longer than one code point.
    string_tests.CommonTest.test_upper(self)
    self.assertEqual('\U0001044F'.upper(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.upper(),
                     '\U00010427\U00010427')
    self.assertEqual('X\U00010427x\U0001044F'.upper(),
                     'X\U00010427X\U00010427')
    # U+FB01 LATIN SMALL LIGATURE FI, written as an escape because the
    # literal had been corrupted by a wrong source decoding.
    self.assertEqual('\ufb01'.upper(), 'FI')
    self.assertEqual('\u0130'.upper(), '\u0130')
    self.assertEqual('\u03a3'.upper(), '\u03a3')
    # U+00DF LATIN SMALL LETTER SHARP S (same corruption, same remedy).
    self.assertEqual('\xdf'.upper(), 'SS')
    self.assertEqual('\u1fd2'.upper(), '\u0399\u0308\u0300')
    self.assertEqual('\U0008fffe'.upper(), '\U0008fffe')
    self.assertEqual('\u2177'.upper(), '\u2167')
606
def test_capitalize(self):
    # str.capitalize() beyond the generic checks: non-BMP letters and
    # characters whose case mappings expand to several code points.
    string_tests.CommonTest.test_capitalize(self)
    self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427'.capitalize(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
                     'X\U0001044Fx\U0001044F')
    self.assertEqual('h\u0130'.capitalize(), 'H\u0069\u0307')
    exp = '\u0399\u0308\u0300\u0069\u0307'
    self.assertEqual('\u1fd2\u0130'.capitalize(), exp)
    # The leading character is U+FB01 LATIN SMALL LIGATURE FI, written as
    # an escape because the literal had been corrupted by a wrong source
    # decoding.
    # NOTE(review): since Python 3.8 str.capitalize() titlecases the first
    # character, which presumably gives 'Finnish' here -- confirm the
    # interpreter version this expectation targets.
    self.assertEqual('\ufb01nnish'.capitalize(), 'FInnish')
    self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2')
623
def test_title(self):
    # str.title() beyond the generic checks: non-BMP letters, a ligature
    # and Greek sigma handling.
    string_tests.MixinStrUnicodeUserStringTest.test_title(self)
    self.assertEqual('\U0001044F'.title(), '\U00010427')
    self.assertEqual('\U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F')
    self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
                     '\U00010427\U0001044F \U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
                     'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
    # The leading character is U+FB01 LATIN SMALL LIGATURE FI, written as
    # an escape because the literal had been corrupted by a wrong source
    # decoding.
    self.assertEqual('\ufb01NNISH'.title(), 'Finnish')
    self.assertEqual('A\u03a3 \u1fa1xy'.title(), 'A\u03c2 \u1fa9xy')
    self.assertEqual('A\u03a3A'.title(), 'A\u03c3a')
640
def test_swapcase(self):
    # str.swapcase() beyond the generic checks: non-BMP letters,
    # characters whose case mappings expand, and the Greek sigma special
    # case.
    string_tests.CommonTest.test_swapcase(self)
    self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
    self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
    self.assertEqual('\U0001044F\U0001044F'.swapcase(),
                     '\U00010427\U00010427')
    self.assertEqual('\U00010427\U0001044F'.swapcase(),
                     '\U0001044F\U00010427')
    self.assertEqual('\U0001044F\U00010427'.swapcase(),
                     '\U00010427\U0001044F')
    self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
                     'x\U0001044FX\U00010427')
    # U+FB01 LATIN SMALL LIGATURE FI, written as an escape because the
    # literal had been corrupted by a wrong source decoding.
    self.assertEqual('\ufb01'.swapcase(), 'FI')
    self.assertEqual('\u0130'.swapcase(), '\u0069\u0307')
    # Special case for GREEK CAPITAL LETTER SIGMA U+03A3
    self.assertEqual('\u03a3'.swapcase(), '\u03c3')
    self.assertEqual('\u0345\u03a3'.swapcase(), '\u0399\u03c3')
    self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
    self.assertEqual('A\u0345\u03a3a'.swapcase(), 'a\u0399\u03c3A')
    self.assertEqual('A\u0345\u03a3'.swapcase(), 'a\u0399\u03c2')
    self.assertEqual('A\u03a3\u0345'.swapcase(), 'a\u03c2\u0399')
    self.assertEqual('\u03a3\u0345 '.swapcase(), '\u03c3\u0399 ')
    self.assertEqual('\u03a3'.swapcase(), '\u03c3')
    # U+00DF LATIN SMALL LETTER SHARP S (same corruption, same remedy).
    self.assertEqual('\xdf'.swapcase(), 'SS')
    self.assertEqual('\u1fd2'.swapcase(), '\u0399\u0308\u0300')
666
def test_center(self):
667
string_tests.CommonTest.test_center(self)
668
self.assertEqual('x'.center(2, '\U0010FFFF'),
670
self.assertEqual('x'.center(3, '\U0010FFFF'),
671
'\U0010FFFFx\U0010FFFF')
672
self.assertEqual('x'.center(4, '\U0010FFFF'),
673
'\U0010FFFFx\U0010FFFF\U0010FFFF')
675
def test_contains(self):
676
# Testing Unicode contains method
677
self.assertIn('a', 'abdb')
678
self.assertIn('a', 'bdab')
679
self.assertIn('a', 'bdaba')
680
self.assertIn('a', 'bdba')
681
self.assertNotIn('a', 'bdb')
682
self.assertIn('a', 'bdba')
683
self.assertIn('a', ('a',1,None))
684
self.assertIn('a', (1,None,'a'))
685
self.assertIn('a', ('a',1,None))
686
self.assertIn('a', (1,None,'a'))
687
self.assertNotIn('a', ('x',1,'y'))
688
self.assertNotIn('a', ('x',1,None))
689
self.assertNotIn('abcd', 'abcxxxx')
690
self.assertIn('ab', 'abcd')
691
self.assertIn('ab', 'abc')
692
self.assertIn('ab', (1,None,'ab'))
693
self.assertIn('', 'abc')
694
self.assertIn('', '')
695
self.assertIn('', 'abc')
696
self.assertNotIn('\0', 'abc')
697
self.assertIn('\0', '\0abc')
698
self.assertIn('\0', 'abc\0')
699
self.assertIn('a', '\0abc')
700
self.assertIn('asdf', 'asdf')
701
self.assertNotIn('asdf', 'asd')
702
self.assertNotIn('asdf', '')
704
self.assertRaises(TypeError, "abc".__contains__)
706
def test_issue18183(self):
707
'\U00010000\U00100000'.lower()
708
'\U00010000\U00100000'.casefold()
709
'\U00010000\U00100000'.upper()
710
'\U00010000\U00100000'.capitalize()
711
'\U00010000\U00100000'.title()
712
'\U00010000\U00100000'.swapcase()
713
'\U00100000'.center(3, '\U00010000')
714
'\U00100000'.ljust(3, '\U00010000')
715
'\U00100000'.rjust(3, '\U00010000')
717
def test_format(self):
718
self.assertEqual(''.format(), '')
719
self.assertEqual('a'.format(), 'a')
720
self.assertEqual('ab'.format(), 'ab')
721
self.assertEqual('a{{'.format(), 'a{')
722
self.assertEqual('a}}'.format(), 'a}')
723
self.assertEqual('{{b'.format(), '{b')
724
self.assertEqual('}}b'.format(), '}b')
725
self.assertEqual('a{{b'.format(), 'a{b')
727
# examples from the PEP:
729
self.assertEqual("My name is {0}".format('Fred'), "My name is Fred")
730
self.assertEqual("My name is {0[name]}".format(dict(name='Fred')),
732
self.assertEqual("My name is {0} :-{{}}".format('Fred'),
733
"My name is Fred :-{}")
735
d = datetime.date(2007, 8, 18)
736
self.assertEqual("The year is {0.year}".format(d),
739
# classes we'll use for testing
741
def __init__(self, x=100):
743
def __format__(self, spec):
747
def __init__(self, x):
749
def __format__(self, spec):
752
# class with __str__, but no __format__
754
def __init__(self, x):
757
return 'E(' + self.x + ')'
759
# class with __repr__, but no __format__ or __str__
761
def __init__(self, x):
764
return 'F(' + self.x + ')'
766
# class with __format__ that forwards to string, for some format_spec's
768
def __init__(self, x):
771
return "string is " + self.x
772
def __format__(self, format_spec):
773
if format_spec == 'd':
774
return 'G(' + self.x + ')'
775
return object.__format__(self, format_spec)
777
class I(datetime.date):
778
def __format__(self, format_spec):
779
return self.strftime(format_spec)
782
def __format__(self, format_spec):
783
return int.__format__(self * 2, format_spec)
786
self.assertEqual(''.format(), '')
787
self.assertEqual('abc'.format(), 'abc')
788
self.assertEqual('{0}'.format('abc'), 'abc')
789
self.assertEqual('{0:}'.format('abc'), 'abc')
790
# self.assertEqual('{ 0 }'.format('abc'), 'abc')
791
self.assertEqual('X{0}'.format('abc'), 'Xabc')
792
self.assertEqual('{0}X'.format('abc'), 'abcX')
793
self.assertEqual('X{0}Y'.format('abc'), 'XabcY')
794
self.assertEqual('{1}'.format(1, 'abc'), 'abc')
795
self.assertEqual('X{1}'.format(1, 'abc'), 'Xabc')
796
self.assertEqual('{1}X'.format(1, 'abc'), 'abcX')
797
self.assertEqual('X{1}Y'.format(1, 'abc'), 'XabcY')
798
self.assertEqual('{0}'.format(-15), '-15')
799
self.assertEqual('{0}{1}'.format(-15, 'abc'), '-15abc')
800
self.assertEqual('{0}X{1}'.format(-15, 'abc'), '-15Xabc')
801
self.assertEqual('{{'.format(), '{')
802
self.assertEqual('}}'.format(), '}')
803
self.assertEqual('{{}}'.format(), '{}')
804
self.assertEqual('{{x}}'.format(), '{x}')
805
self.assertEqual('{{{0}}}'.format(123), '{123}')
806
self.assertEqual('{{{{0}}}}'.format(), '{{0}}')
807
self.assertEqual('}}{{'.format(), '}{')
808
self.assertEqual('}}x{{'.format(), '}x{')
811
self.assertEqual("{0[foo-bar]}".format({'foo-bar':'baz'}), 'baz')
812
self.assertEqual("{0[foo bar]}".format({'foo bar':'baz'}), 'baz')
813
self.assertEqual("{0[ ]}".format({' ':3}), '3')
815
self.assertEqual('{foo._x}'.format(foo=C(20)), '20')
816
self.assertEqual('{1}{0}'.format(D(10), D(20)), '2010')
817
self.assertEqual('{0._x.x}'.format(C(D('abc'))), 'abc')
818
self.assertEqual('{0[0]}'.format(['abc', 'def']), 'abc')
819
self.assertEqual('{0[1]}'.format(['abc', 'def']), 'def')
820
self.assertEqual('{0[1][0]}'.format(['abc', ['def']]), 'def')
821
self.assertEqual('{0[1][0].x}'.format(['abc', [D('def')]]), 'def')
824
self.assertEqual('{0:.3s}'.format('abc'), 'abc')
825
self.assertEqual('{0:.3s}'.format('ab'), 'ab')
826
self.assertEqual('{0:.3s}'.format('abcdef'), 'abc')
827
self.assertEqual('{0:.0s}'.format('abcdef'), '')
828
self.assertEqual('{0:3.3s}'.format('abc'), 'abc')
829
self.assertEqual('{0:2.3s}'.format('abc'), 'abc')
830
self.assertEqual('{0:2.2s}'.format('abc'), 'ab')
831
self.assertEqual('{0:3.2s}'.format('abc'), 'ab ')
832
self.assertEqual('{0:x<0s}'.format('result'), 'result')
833
self.assertEqual('{0:x<5s}'.format('result'), 'result')
834
self.assertEqual('{0:x<6s}'.format('result'), 'result')
835
self.assertEqual('{0:x<7s}'.format('result'), 'resultx')
836
self.assertEqual('{0:x<8s}'.format('result'), 'resultxx')
837
self.assertEqual('{0: <7s}'.format('result'), 'result ')
838
self.assertEqual('{0:<7s}'.format('result'), 'result ')
839
self.assertEqual('{0:>7s}'.format('result'), ' result')
840
self.assertEqual('{0:>8s}'.format('result'), ' result')
841
self.assertEqual('{0:^8s}'.format('result'), ' result ')
842
self.assertEqual('{0:^9s}'.format('result'), ' result ')
843
self.assertEqual('{0:^10s}'.format('result'), ' result ')
844
self.assertEqual('{0:10000}'.format('a'), 'a' + ' ' * 9999)
845
self.assertEqual('{0:10000}'.format(''), ' ' * 10000)
846
self.assertEqual('{0:10000000}'.format(''), ' ' * 10000000)
848
# format specifiers for user defined type
849
self.assertEqual('{0:abc}'.format(C()), 'abc')
851
# !r, !s and !a coercions
852
self.assertEqual('{0!s}'.format('Hello'), 'Hello')
853
self.assertEqual('{0!s:}'.format('Hello'), 'Hello')
854
self.assertEqual('{0!s:15}'.format('Hello'), 'Hello ')
855
self.assertEqual('{0!s:15s}'.format('Hello'), 'Hello ')
856
self.assertEqual('{0!r}'.format('Hello'), "'Hello'")
857
self.assertEqual('{0!r:}'.format('Hello'), "'Hello'")
858
self.assertEqual('{0!r}'.format(F('Hello')), 'F(Hello)')
859
self.assertEqual('{0!r}'.format('\u0378'), "'\\u0378'") # nonprintable
860
self.assertEqual('{0!r}'.format('\u0374'), "'\u0374'") # printable
861
self.assertEqual('{0!r}'.format(F('\u0374')), 'F(\u0374)')
862
self.assertEqual('{0!a}'.format('Hello'), "'Hello'")
863
self.assertEqual('{0!a}'.format('\u0378'), "'\\u0378'") # nonprintable
864
self.assertEqual('{0!a}'.format('\u0374'), "'\\u0374'") # printable
865
self.assertEqual('{0!a:}'.format('Hello'), "'Hello'")
866
self.assertEqual('{0!a}'.format(F('Hello')), 'F(Hello)')
867
self.assertEqual('{0!a}'.format(F('\u0374')), 'F(\\u0374)')
869
# test fallback to object.__format__
870
self.assertEqual('{0}'.format({}), '{}')
871
self.assertEqual('{0}'.format([]), '[]')
872
self.assertEqual('{0}'.format([1]), '[1]')
874
self.assertEqual('{0:d}'.format(G('data')), 'G(data)')
875
self.assertEqual('{0!s}'.format(G('data')), 'string is data')
877
self.assertRaises(TypeError, '{0:^10}'.format, E('data'))
878
self.assertRaises(TypeError, '{0:^10s}'.format, E('data'))
879
self.assertRaises(TypeError, '{0:>15s}'.format, G('data'))
881
self.assertEqual("{0:date: %Y-%m-%d}".format(I(year=2007,
886
# test deriving from a builtin type and overriding __format__
887
self.assertEqual("{0}".format(J(10)), "20")
890
# string format specifiers
891
self.assertEqual('{0:}'.format('a'), 'a')
893
# computed format specifiers
894
self.assertEqual("{0:.{1}}".format('hello world', 5), 'hello')
895
self.assertEqual("{0:.{1}s}".format('hello world', 5), 'hello')
896
self.assertEqual("{0:.{precision}s}".format('hello world', precision=5), 'hello')
897
self.assertEqual("{0:{width}.{precision}s}".format('hello world', width=10, precision=5), 'hello ')
898
self.assertEqual("{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), 'hello ')
900
# test various errors
901
self.assertRaises(ValueError, '{'.format)
902
self.assertRaises(ValueError, '}'.format)
903
self.assertRaises(ValueError, 'a{'.format)
904
self.assertRaises(ValueError, 'a}'.format)
905
self.assertRaises(ValueError, '{a'.format)
906
self.assertRaises(ValueError, '}a'.format)
907
self.assertRaises(IndexError, '{0}'.format)
908
self.assertRaises(IndexError, '{1}'.format, 'abc')
909
self.assertRaises(KeyError, '{x}'.format)
910
self.assertRaises(ValueError, "}{".format)
911
self.assertRaises(ValueError, "abc{0:{}".format)
912
self.assertRaises(ValueError, "{0".format)
913
self.assertRaises(IndexError, "{0.}".format)
914
self.assertRaises(ValueError, "{0.}".format, 0)
915
self.assertRaises(ValueError, "{0[}".format)
916
self.assertRaises(ValueError, "{0[}".format, [])
917
self.assertRaises(KeyError, "{0]}".format)
918
self.assertRaises(ValueError, "{0.[]}".format, 0)
919
self.assertRaises(ValueError, "{0..foo}".format, 0)
920
self.assertRaises(ValueError, "{0[0}".format, 0)
921
self.assertRaises(ValueError, "{0[0:foo}".format, 0)
922
self.assertRaises(KeyError, "{c]}".format)
923
self.assertRaises(ValueError, "{{ {{{0}}".format, 0)
924
self.assertRaises(ValueError, "{0}}".format, 0)
925
self.assertRaises(KeyError, "{foo}".format, bar=3)
926
self.assertRaises(ValueError, "{0!x}".format, 3)
927
self.assertRaises(ValueError, "{0!}".format, 0)
928
self.assertRaises(ValueError, "{0!rs}".format, 0)
929
self.assertRaises(ValueError, "{!}".format)
930
self.assertRaises(IndexError, "{:}".format)
931
self.assertRaises(IndexError, "{:s}".format)
932
self.assertRaises(IndexError, "{}".format)
933
big = "23098475029384702983476098230754973209482573"
934
self.assertRaises(ValueError, ("{" + big + "}").format)
935
self.assertRaises(ValueError, ("{[" + big + "]}").format, [0])
938
self.assertRaises(ValueError, "{0[0]x}".format, [None])
939
self.assertRaises(ValueError, "{0[0](10)}".format, [None])
941
# can't have a replacement on the field name portion
942
self.assertRaises(TypeError, '{0[{1}]}'.format, 'abcdefg', 4)
944
# exceed maximum recursion depth
945
self.assertRaises(ValueError, "{0:{1:{2}}}".format, 'abc', 's', '')
946
self.assertRaises(ValueError, "{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
947
0, 1, 2, 3, 4, 5, 6, 7)
949
# string format spec errors
950
self.assertRaises(ValueError, "{0:-s}".format, '')
951
self.assertRaises(ValueError, format, "", "-")
952
self.assertRaises(ValueError, "{0:=s}".format, '')
954
# Alternate formatting is not supported
955
self.assertRaises(ValueError, format, '', '#')
956
self.assertRaises(ValueError, format, '', '#20')
959
self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
960
'ABC\u0410\u0411\u0412')
961
self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
963
self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
966
self.assertEqual("{[{}]}".format({"{}": 5}), "5")
967
self.assertEqual("{[{}]}".format({"{}" : "a"}), "a")
968
self.assertEqual("{[{]}".format({"{" : "a"}), "a")
969
self.assertEqual("{[}]}".format({"}" : "a"}), "a")
970
self.assertEqual("{[[]}".format({"[" : "a"}), "a")
971
self.assertEqual("{[!]}".format({"!" : "a"}), "a")
972
self.assertRaises(ValueError, "{a{}b}".format, 42)
973
self.assertRaises(ValueError, "{a{b}".format, 42)
974
self.assertRaises(ValueError, "{[}".format, 42)
976
def test_format_map(self):
977
self.assertEqual(''.format_map({}), '')
978
self.assertEqual('a'.format_map({}), 'a')
979
self.assertEqual('ab'.format_map({}), 'ab')
980
self.assertEqual('a{{'.format_map({}), 'a{')
981
self.assertEqual('a}}'.format_map({}), 'a}')
982
self.assertEqual('{{b'.format_map({}), '{b')
983
self.assertEqual('}}b'.format_map({}), '}b')
984
self.assertEqual('a{{b'.format_map({}), 'a{b')
988
def __missing__(self, key):
990
self.assertEqual('{hello}'.format_map(Mapping()), 'hello')
991
self.assertEqual('{a} {world}'.format_map(Mapping(a='hello')), 'hello world')
993
class InternalMapping:
995
self.mapping = {'a': 'hello'}
996
def __getitem__(self, key):
997
return self.mapping[key]
998
self.assertEqual('{a}'.format_map(InternalMapping()), 'hello')
1002
def __init__(self, x=100):
1004
def __format__(self, spec):
1006
self.assertEqual('{foo._x}'.format_map({'foo': C(20)}), '20')
1008
# test various errors
1009
self.assertRaises(TypeError, ''.format_map)
1010
self.assertRaises(TypeError, 'a'.format_map)
1012
self.assertRaises(ValueError, '{'.format_map, {})
1013
self.assertRaises(ValueError, '}'.format_map, {})
1014
self.assertRaises(ValueError, 'a{'.format_map, {})
1015
self.assertRaises(ValueError, 'a}'.format_map, {})
1016
self.assertRaises(ValueError, '{a'.format_map, {})
1017
self.assertRaises(ValueError, '}a'.format_map, {})
1019
# issue #12579: can't supply positional params to format_map
1020
self.assertRaises(ValueError, '{}'.format_map, {'a' : 2})
1021
self.assertRaises(ValueError, '{}'.format_map, 'a')
1022
self.assertRaises(ValueError, '{a} {}'.format_map, {"a" : 2, "b" : 1})
1024
def test_format_huge_precision(self):
1025
format_string = ".{}f".format(sys.maxsize + 1)
1026
with self.assertRaises(ValueError):
1027
result = format(2.34, format_string)
1029
def test_format_huge_width(self):
1030
format_string = "{}f".format(sys.maxsize + 1)
1031
with self.assertRaises(ValueError):
1032
result = format(2.34, format_string)
1034
def test_format_huge_item_number(self):
1035
format_string = "{{{}:.6f}}".format(sys.maxsize + 1)
1036
with self.assertRaises(ValueError):
1037
result = format_string.format(2.34)
1039
def test_format_auto_numbering(self):
1041
def __init__(self, x=100):
1043
def __format__(self, spec):
1046
self.assertEqual('{}'.format(10), '10')
1047
self.assertEqual('{:5}'.format('s'), 's ')
1048
self.assertEqual('{!r}'.format('s'), "'s'")
1049
self.assertEqual('{._x}'.format(C(10)), '10')
1050
self.assertEqual('{[1]}'.format([1, 2]), '2')
1051
self.assertEqual('{[a]}'.format({'a':4, 'b':2}), '4')
1052
self.assertEqual('a{}b{}c'.format(0, 1), 'a0b1c')
1054
self.assertEqual('a{:{}}b'.format('x', '^10'), 'a x b')
1055
self.assertEqual('a{:{}x}b'.format(20, '#'), 'a0x14b')
1057
# can't mix and match numbering and auto-numbering
1058
self.assertRaises(ValueError, '{}{1}'.format, 1, 2)
1059
self.assertRaises(ValueError, '{1}{}'.format, 1, 2)
1060
self.assertRaises(ValueError, '{:{1}}'.format, 1, 2)
1061
self.assertRaises(ValueError, '{0:{}}'.format, 1, 2)
1063
# can mix and match auto-numbering and named
1064
self.assertEqual('{f}{}'.format(4, f='test'), 'test4')
1065
self.assertEqual('{}{f}'.format(4, f='test'), '4test')
1066
self.assertEqual('{:{f}}{g}{}'.format(1, 3, g='g', f=2), ' 1g3')
1067
self.assertEqual('{f:{}}{}{g}'.format(2, 4, f=1, g='g'), ' 14g')
1069
def test_formatting(self):
1070
string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
1071
# Testing Unicode formatting strings...
1072
self.assertEqual("%s, %s" % ("abc", "abc"), 'abc, abc')
1073
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, 2, 3), 'abc, abc, 1, 2.000000, 3.00')
1074
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", 1, -2, 3), 'abc, abc, 1, -2.000000, 3.00')
1075
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.5), 'abc, abc, -1, -2.000000, 3.50')
1076
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 3.57), 'abc, abc, -1, -2.000000, 3.57')
1077
self.assertEqual("%s, %s, %i, %f, %5.2f" % ("abc", "abc", -1, -2, 1003.57), 'abc, abc, -1, -2.000000, 1003.57')
1078
if not sys.platform.startswith('java'):
1079
self.assertEqual("%r, %r" % (b"abc", "abc"), "b'abc', 'abc'")
1080
self.assertEqual("%r" % ("\u1234",), "'\u1234'")
1081
self.assertEqual("%a" % ("\u1234",), "'\\u1234'")
1082
self.assertEqual("%(x)s, %(y)s" % {'x':"abc", 'y':"def"}, 'abc, def')
1083
self.assertEqual("%(x)s, %(\xfc)s" % {'x':"abc", '\xfc':"def"}, 'abc, def')
1085
self.assertEqual('%c' % 0x1234, '\u1234')
1086
self.assertEqual('%c' % 0x21483, '\U00021483')
1087
self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
1088
self.assertEqual('%c' % '\U00021483', '\U00021483')
1089
self.assertRaises(TypeError, "%c".__mod__, "aa")
1090
self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
1091
self.assertRaises(TypeError, "%i".__mod__, "aa")
1093
# formatting jobs delegated from the string implementation:
1094
self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1095
self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1096
self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1097
self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
1098
self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1099
self.assertEqual('...%(foo)s...' % {'foo':"abc",'def':123}, '...abc...')
1100
self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,"abc"), '...1...2...3...abc...')
1101
self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,"abc"), '...%...%s...1...2...3...abc...')
1102
self.assertEqual('...%s...' % "abc", '...abc...')
1103
self.assertEqual('%*s' % (5,'abc',), ' abc')
1104
self.assertEqual('%*s' % (-5,'abc',), 'abc ')
1105
self.assertEqual('%*.*s' % (5,2,'abc',), ' ab')
1106
self.assertEqual('%*.*s' % (5,3,'abc',), ' abc')
1107
self.assertEqual('%i %*.*s' % (10, 5,3,'abc',), '10 abc')
1108
self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, 'abc',), '103 abc')
1109
self.assertEqual('%c' % 'a', 'a')
1113
self.assertEqual('%s' % Wrapper(), '\u1234')
1118
self.assertEqual('%f' % NAN, 'nan')
1119
self.assertEqual('%F' % NAN, 'NAN')
1120
self.assertEqual('%f' % INF, 'inf')
1121
self.assertEqual('%F' % INF, 'INF')
1124
self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
1125
self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
1127
def test_formatting_with_enum(self):
1130
class Float(float, enum.Enum):
1132
class Int(enum.IntEnum):
1134
class Str(str, enum.Enum):
1136
# Testing Unicode formatting strings...
1137
self.assertEqual("%s, %s" % (Str.ABC, Str.ABC),
1139
self.assertEqual("%s, %s, %d, %i, %u, %f, %5.2f" %
1141
Int.IDES, Int.IDES, Int.IDES,
1142
Float.PI, Float.PI),
1143
'Str.ABC, Str.ABC, 15, 15, 15, 3.141593, 3.14')
1145
# formatting jobs delegated from the string implementation:
1146
self.assertEqual('...%(foo)s...' % {'foo':Str.ABC},
1148
self.assertEqual('...%(foo)s...' % {'foo':Int.IDES},
1150
self.assertEqual('...%(foo)i...' % {'foo':Int.IDES},
1152
self.assertEqual('...%(foo)d...' % {'foo':Int.IDES},
1154
self.assertEqual('...%(foo)u...' % {'foo':Int.IDES, 'def':Float.PI},
1156
self.assertEqual('...%(foo)f...' % {'foo':Float.PI,'def':123},
1159
@support.cpython_only
1160
def test_formatting_huge_precision(self):
1161
from _testcapi import INT_MAX
1162
format_string = "%.{}f".format(INT_MAX + 1)
1163
with self.assertRaises(ValueError):
1164
result = format_string % 2.34
1166
def test_formatting_huge_width(self):
1167
format_string = "%{}f".format(sys.maxsize + 1)
1168
with self.assertRaises(ValueError):
1169
result = format_string % 2.34
1171
def test_startswith_endswith_errors(self):
1172
for meth in ('foo'.startswith, 'foo'.endswith):
1173
with self.assertRaises(TypeError) as cm:
1175
exc = str(cm.exception)
1176
self.assertIn('str', exc)
1177
self.assertIn('tuple', exc)
1179
@support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
1180
def test_format_float(self):
1181
# should not format with a comma, but always with C locale
1182
self.assertEqual('1.0', '%.1f' % 1.0)
1184
def test_constructor(self):
1185
# unicode(obj) tests (this maps to PyObject_Unicode() at C level)
1188
str('unicode remains unicode'),
1189
'unicode remains unicode'
1192
class UnicodeSubclass(str):
1195
for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
1196
subclass = UnicodeSubclass(text)
1197
self.assertEqual(str(subclass), text)
1198
self.assertEqual(len(subclass), len(text))
1200
self.assertEqual(subclass.encode('ascii'), b'ascii')
1201
self.assertEqual(subclass.encode('utf-8'), b'ascii')
1204
str('strings are converted to unicode'),
1205
'strings are converted to unicode'
1209
def __init__(self, x):
1215
str(StringCompat('__str__ compatible objects are recognized')),
1216
'__str__ compatible objects are recognized'
1219
# unicode(obj) is compatible to str():
1221
o = StringCompat('unicode(obj) is compatible to str()')
1222
self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1223
self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
1225
for obj in (123, 123.45, 123):
1226
self.assertEqual(str(obj), str(str(obj)))
1228
# unicode(obj, encoding, error) tests (this maps to
1229
# PyUnicode_FromEncodedObject() at C level)
1231
if not sys.platform.startswith('java'):
1235
'decoding unicode is not supported',
1241
str(b'strings are decoded to unicode', 'utf-8', 'strict'),
1242
'strings are decoded to unicode'
1245
if not sys.platform.startswith('java'):
1248
memoryview(b'character buffers are decoded to unicode'),
1252
'character buffers are decoded to unicode'
1255
self.assertRaises(TypeError, str, 42, 42, 42)
1257
def test_constructor_keyword_args(self):
1258
"""Pass various keyword argument combinations to the constructor."""
1259
# The object argument can be passed as a keyword.
1260
self.assertEqual(str(object='foo'), 'foo')
1261
self.assertEqual(str(object=b'foo', encoding='utf-8'), 'foo')
1262
# The errors argument without encoding triggers "decode" mode.
1263
self.assertEqual(str(b'foo', errors='strict'), 'foo') # not "b'foo'"
1264
self.assertEqual(str(object=b'foo', errors='strict'), 'foo')
1266
def test_constructor_defaults(self):
1267
"""Check the constructor argument defaults."""
1268
# The object argument defaults to '' or b''.
1269
self.assertEqual(str(), '')
1270
self.assertEqual(str(errors='strict'), '')
1271
utf8_cent = 'Ā¢'.encode('utf-8')
1272
# The encoding argument defaults to utf-8.
1273
self.assertEqual(str(utf8_cent, errors='strict'), 'Ā¢')
1274
# The errors argument defaults to strict.
1275
self.assertRaises(UnicodeDecodeError, str, utf8_cent, encoding='ascii')
1277
def test_codecs_utf7(self):
1279
('A\u2262\u0391.', b'A+ImIDkQ.'), # RFC2152 example
1280
('Hi Mom -\u263a-!', b'Hi Mom -+Jjo--!'), # RFC2152 example
1281
('\u65E5\u672C\u8A9E', b'+ZeVnLIqe-'), # RFC2152 example
1282
('Item 3 is \u00a31.', b'Item 3 is +AKM-1.'), # RFC2152 example
1288
(r'\\?', b'+AFwAXA?'),
1289
(r'\\\?', b'+AFwAXABc?'),
1290
(r'++--', b'+-+---'),
1291
('\U000abcde', b'+2m/c3g-'), # surrogate pairs
1295
for (x, y) in utfTests:
1296
self.assertEqual(x.encode('utf-7'), y)
1298
# Unpaired surrogates are passed through
1299
self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
1300
self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
1301
self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
1302
self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
1303
self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
1304
self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
1305
self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
1306
self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
1308
self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
1309
self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
1311
# Issue #2242: crash on some Windows/MSVC versions
1312
self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
1314
# Direct encoded characters
1315
set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
1316
# Optional direct characters
1317
set_o = '!"#$%&*;<=>@[]^_`{|}'
1319
self.assertEqual(c.encode('utf7'), c.encode('ascii'))
1320
self.assertEqual(c.encode('ascii').decode('utf7'), c)
1322
self.assertEqual(c.encode('ascii').decode('utf7'), c)
1324
def test_codecs_utf8(self):
1325
self.assertEqual(''.encode('utf-8'), b'')
1326
self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac')
1327
self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82')
1328
self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96')
1329
self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80')
1330
self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80')
1331
self.assertEqual(('\U00010002'*10).encode('utf-8'),
1332
b'\xf0\x90\x80\x82'*10)
1334
'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
1335
'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
1336
'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
1337
'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
1338
'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
1339
' Nunstuck git und'.encode('utf-8'),
1340
b'\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
1341
b'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
1342
b'\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
1343
b'\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
1344
b'\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
1345
b'\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
1346
b'\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
1347
b'\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
1348
b'\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
1349
b'\xe3\x80\x8cWenn ist das Nunstuck git und'
1352
# UTF-8 specific decoding tests
1353
self.assertEqual(str(b'\xf0\xa3\x91\x96', 'utf-8'), '\U00023456' )
1354
self.assertEqual(str(b'\xf0\x90\x80\x82', 'utf-8'), '\U00010002' )
1355
self.assertEqual(str(b'\xe2\x82\xac', 'utf-8'), '\u20ac' )
1357
# Other possible utf-8 test cases:
1358
# * strict decoding testing for all of the
1359
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
1361
def test_utf8_decode_valid_sequences(self):
1364
(b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
1366
(b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
1368
(b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
1369
(b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
1371
(b'\xF0\x90\x80\x80', '\U00010000'),
1372
(b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
1374
for seq, res in sequences:
1375
self.assertEqual(seq.decode('utf-8'), res)
1378
def test_utf8_decode_invalid_sequences(self):
1379
# continuation bytes in a sequence of 2, 3, or 4 bytes
1380
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
1381
# start bytes of a 2-byte sequence equivalent to codepoints < 0x7F
1382
invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
1383
# start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
1384
invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
1385
invalid_start_bytes = (
1386
continuation_bytes + invalid_2B_seq_start_bytes +
1387
invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
1390
for byte in invalid_start_bytes:
1391
self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
1393
for sb in invalid_2B_seq_start_bytes:
1394
for cb in continuation_bytes:
1395
self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
1397
for sb in invalid_4B_seq_start_bytes:
1398
for cb1 in continuation_bytes[:3]:
1399
for cb3 in continuation_bytes[:3]:
1400
self.assertRaises(UnicodeDecodeError,
1401
(sb+cb1+b'\x80'+cb3).decode, 'utf-8')
1403
for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
1404
self.assertRaises(UnicodeDecodeError,
1405
(b'\xE0'+cb+b'\x80').decode, 'utf-8')
1406
self.assertRaises(UnicodeDecodeError,
1407
(b'\xE0'+cb+b'\xBF').decode, 'utf-8')
1409
for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1410
self.assertRaises(UnicodeDecodeError,
1411
(b'\xED'+cb+b'\x80').decode, 'utf-8')
1412
self.assertRaises(UnicodeDecodeError,
1413
(b'\xED'+cb+b'\xBF').decode, 'utf-8')
1414
for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1415
self.assertRaises(UnicodeDecodeError,
1416
(b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1417
self.assertRaises(UnicodeDecodeError,
1418
(b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1419
for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1420
self.assertRaises(UnicodeDecodeError,
1421
(b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1422
self.assertRaises(UnicodeDecodeError,
1423
(b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1425
def test_issue8271(self):
1426
# Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1427
# only the start byte and the continuation byte(s) are now considered
1428
# invalid, instead of the number of bytes specified by the start byte.
1429
# See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1430
# table 3-8, Row 2) for more information about the algorithm used.
1433
# invalid start bytes
1434
(b'\x80', FFFD), # continuation byte
1435
(b'\x80\x80', FFFD*2), # 2 continuation bytes
1437
(b'\xc0\xc0', FFFD*2),
1439
(b'\xc1\xc0', FFFD*2),
1440
(b'\xc0\xc1', FFFD*2),
1441
# with start byte of a 2-byte sequence
1442
(b'\xc2', FFFD), # only the start byte
1443
(b'\xc2\xc2', FFFD*2), # 2 start bytes
1444
(b'\xc2\xc2\xc2', FFFD*3), # 3 start bytes
1445
(b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1446
# with start byte of a 3-byte sequence
1447
(b'\xe1', FFFD), # only the start byte
1448
(b'\xe1\xe1', FFFD*2), # 2 start bytes
1449
(b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1450
(b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1451
(b'\xe1\x80', FFFD), # only 1 continuation byte
1452
(b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1453
(b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1454
(b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1455
(b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1456
(b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1457
(b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1458
# with start byte of a 4-byte sequence
1459
(b'\xf1', FFFD), # only the start byte
1460
(b'\xf1\xf1', FFFD*2), # 2 start bytes
1461
(b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1462
(b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1463
(b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1464
(b'\xf1\x80', FFFD), # only 1 continuation bytes
1465
(b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1466
(b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1467
(b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1468
(b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1469
(b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1470
(b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1471
(b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1472
(b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1473
(b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1474
(b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1475
(b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1476
(b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1477
# with invalid start byte of a 4-byte sequence (rfc2279)
1478
(b'\xf5', FFFD), # only the start byte
1479
(b'\xf5\xf5', FFFD*2), # 2 start bytes
1480
(b'\xf5\x80', FFFD*2), # only 1 continuation byte
1481
(b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1482
(b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1483
(b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1484
(b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1485
(b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1486
# with invalid start byte of a 5-byte sequence (rfc2279)
1487
(b'\xf8', FFFD), # only the start byte
1488
(b'\xf8\xf8', FFFD*2), # 2 start bytes
1489
(b'\xf8\x80', FFFD*2), # only one continuation byte
1490
(b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1491
(b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1492
# with invalid start byte of a 6-byte sequence (rfc2279)
1493
(b'\xfc', FFFD), # only the start byte
1494
(b'\xfc\xfc', FFFD*2), # 2 start bytes
1495
(b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1496
(b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1497
# invalid start byte
1499
(b'\xfe\x80\x80', FFFD*3),
1501
(b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1502
(b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1503
(b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1504
(b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1505
'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1507
for n, (seq, res) in enumerate(sequences):
1508
self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1509
self.assertEqual(seq.decode('utf-8', 'replace'), res)
1510
self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1511
self.assertEqual(seq.decode('utf-8', 'ignore'),
1512
res.replace('\uFFFD', ''))
1514
def to_bytestring(self, seq):
    """Turn whitespace-separated hex byte values (e.g. 'C2 41') into bytes."""
    byte_values = [int(hex_token, 16) for hex_token in seq.split()]
    return bytes(byte_values)
1517
def assertCorrectUTF8Decoding(self, seq, res, err):
1519
Check that an invalid UTF-8 sequence raises an UnicodeDecodeError when
1520
'strict' is used, returns res when 'replace' is used, and that doesn't
1521
return anything when 'ignore' is used.
1523
with self.assertRaises(UnicodeDecodeError) as cm:
1527
self.assertIn(err, str(exc))
1528
self.assertEqual(seq.decode('utf-8', 'replace'), res)
1529
self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
1530
'aaaa' + res + 'bbbb')
1531
res = res.replace('\ufffd', '')
1532
self.assertEqual(seq.decode('utf-8', 'ignore'), res)
1533
self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
1534
'aaaa' + res + 'bbbb')
1536
def test_invalid_start_byte(self):
1538
Test that an 'invalid start byte' error is raised when the first byte
1539
is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
1540
4-bytes sequence. The invalid start byte is replaced with a single
1541
U+FFFD when errors='replace'.
1542
E.g. <80> is a continuation byte and can appear only after a start byte.
1545
for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
1546
self.assertCorrectUTF8Decoding(bytes([byte]), '\ufffd',
1547
'invalid start byte')
1549
def test_unexpected_end_of_data(self):
1551
Test that an 'unexpected end of data' error is raised when the string
1552
ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
1553
enough continuation bytes. The incomplete sequence is replaced with a
1554
single U+FFFD when errors='replace'.
1555
E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
1556
sequence, but it's followed by only 2 valid continuation bytes and the
1557
last continuation bytes is missing.
1558
Note: the continuation bytes must be all valid, if one of them is
1559
invalid another error will be raised.
1563
'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
1564
'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
1565
'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
1566
'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
1567
'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
1568
'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
1571
for seq in sequences:
1572
self.assertCorrectUTF8Decoding(self.to_bytestring(seq), '\ufffd',
1573
'unexpected end of data')
1575
def test_invalid_cb_for_2bytes_seq(self):
1577
Test that an 'invalid continuation byte' error is raised when the
1578
continuation byte of a 2-bytes sequence is invalid. The start byte
1579
is replaced by a single U+FFFD and the second byte is handled
1580
separately when errors='replace'.
1581
E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
1582
sequence, but 41 is not a valid continuation byte because it's the
1588
('C2 00', FFFD+'\x00'), ('C2 7F', FFFD+'\x7f'),
1589
('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
1590
('DF 00', FFFD+'\x00'), ('DF 7F', FFFD+'\x7f'),
1591
('DF C0', FFFDx2), ('DF FF', FFFDx2),
1593
for seq, res in sequences:
1594
self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1595
'invalid continuation byte')
1597
def test_invalid_cb_for_3bytes_seq(self):
1599
Test that an 'invalid continuation byte' error is raised when the
1600
continuation byte(s) of a 3-bytes sequence are invalid. When
1601
errors='replace', if the first continuation byte is valid, the first
1602
two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
1603
third byte is handled separately, otherwise only the start byte is
1604
replaced with a U+FFFD and the other continuation bytes are handled
1606
E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1607
sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1608
because it's the ASCII letter 'A'.
1609
Note: when the start byte is E0 or ED, the valid ranges for the first
1610
continuation byte are limited to A0..BF and 80..9F respectively.
1611
Python 2 used to consider all the bytes in range 80..BF valid when the
1612
start byte was ED. This is fixed in Python 3.
1617
('E0 00', FFFD+'\x00'), ('E0 7F', FFFD+'\x7f'), ('E0 80', FFFDx2),
1618
('E0 9F', FFFDx2), ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
1619
('E0 A0 00', FFFD+'\x00'), ('E0 A0 7F', FFFD+'\x7f'),
1620
('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
1621
('E0 BF 00', FFFD+'\x00'), ('E0 BF 7F', FFFD+'\x7f'),
1622
('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2), ('E1 00', FFFD+'\x00'),
1623
('E1 7F', FFFD+'\x7f'), ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
1624
('E1 80 00', FFFD+'\x00'), ('E1 80 7F', FFFD+'\x7f'),
1625
('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
1626
('E1 BF 00', FFFD+'\x00'), ('E1 BF 7F', FFFD+'\x7f'),
1627
('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2), ('EC 00', FFFD+'\x00'),
1628
('EC 7F', FFFD+'\x7f'), ('EC C0', FFFDx2), ('EC FF', FFFDx2),
1629
('EC 80 00', FFFD+'\x00'), ('EC 80 7F', FFFD+'\x7f'),
1630
('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
1631
('EC BF 00', FFFD+'\x00'), ('EC BF 7F', FFFD+'\x7f'),
1632
('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2), ('ED 00', FFFD+'\x00'),
1633
('ED 7F', FFFD+'\x7f'),
1634
('ED A0', FFFDx2), ('ED BF', FFFDx2), # see note ^
1635
('ED C0', FFFDx2), ('ED FF', FFFDx2), ('ED 80 00', FFFD+'\x00'),
1636
('ED 80 7F', FFFD+'\x7f'), ('ED 80 C0', FFFDx2),
1637
('ED 80 FF', FFFDx2), ('ED 9F 00', FFFD+'\x00'),
1638
('ED 9F 7F', FFFD+'\x7f'), ('ED 9F C0', FFFDx2),
1639
('ED 9F FF', FFFDx2), ('EE 00', FFFD+'\x00'),
1640
('EE 7F', FFFD+'\x7f'), ('EE C0', FFFDx2), ('EE FF', FFFDx2),
1641
('EE 80 00', FFFD+'\x00'), ('EE 80 7F', FFFD+'\x7f'),
1642
('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
1643
('EE BF 00', FFFD+'\x00'), ('EE BF 7F', FFFD+'\x7f'),
1644
('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2), ('EF 00', FFFD+'\x00'),
1645
('EF 7F', FFFD+'\x7f'), ('EF C0', FFFDx2), ('EF FF', FFFDx2),
1646
('EF 80 00', FFFD+'\x00'), ('EF 80 7F', FFFD+'\x7f'),
1647
('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
1648
('EF BF 00', FFFD+'\x00'), ('EF BF 7F', FFFD+'\x7f'),
1649
('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
1651
for seq, res in sequences:
1652
self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1653
'invalid continuation byte')
1655
def test_invalid_cb_for_4bytes_seq(self):
1657
Test that an 'invalid continuation byte' error is raised when the
1658
continuation byte(s) of a 4-bytes sequence are invalid. When
1659
errors='replace',the start byte and all the following valid
1660
continuation bytes are replaced with a single U+FFFD, and all the bytes
1661
starting from the first invalid continuation bytes (included) are
1663
E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
1664
sequence, 80 is a valid continuation byte, but 41 is not a valid cb
1665
because it's the ASCII letter 'A'.
1666
Note: when the start byte is E0 or ED, the valid ranges for the first
1667
continuation byte are limited to A0..BF and 80..9F respectively.
1668
However, when the start byte is ED, Python 2 considers all the bytes
1669
in range 80..BF valid. This is fixed in Python 3.
1674
('F0 00', FFFD+'\x00'), ('F0 7F', FFFD+'\x7f'), ('F0 80', FFFDx2),
1675
('F0 8F', FFFDx2), ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
1676
('F0 90 00', FFFD+'\x00'), ('F0 90 7F', FFFD+'\x7f'),
1677
('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
1678
('F0 BF 00', FFFD+'\x00'), ('F0 BF 7F', FFFD+'\x7f'),
1679
('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
1680
('F0 90 80 00', FFFD+'\x00'), ('F0 90 80 7F', FFFD+'\x7f'),
1681
('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
1682
('F0 90 BF 00', FFFD+'\x00'), ('F0 90 BF 7F', FFFD+'\x7f'),
1683
('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
1684
('F0 BF 80 00', FFFD+'\x00'), ('F0 BF 80 7F', FFFD+'\x7f'),
1685
('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
1686
('F0 BF BF 00', FFFD+'\x00'), ('F0 BF BF 7F', FFFD+'\x7f'),
1687
('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
1688
('F1 00', FFFD+'\x00'), ('F1 7F', FFFD+'\x7f'), ('F1 C0', FFFDx2),
1689
('F1 FF', FFFDx2), ('F1 80 00', FFFD+'\x00'),
1690
('F1 80 7F', FFFD+'\x7f'), ('F1 80 C0', FFFDx2),
1691
('F1 80 FF', FFFDx2), ('F1 BF 00', FFFD+'\x00'),
1692
('F1 BF 7F', FFFD+'\x7f'), ('F1 BF C0', FFFDx2),
1693
('F1 BF FF', FFFDx2), ('F1 80 80 00', FFFD+'\x00'),
1694
('F1 80 80 7F', FFFD+'\x7f'), ('F1 80 80 C0', FFFDx2),
1695
('F1 80 80 FF', FFFDx2), ('F1 80 BF 00', FFFD+'\x00'),
1696
('F1 80 BF 7F', FFFD+'\x7f'), ('F1 80 BF C0', FFFDx2),
1697
('F1 80 BF FF', FFFDx2), ('F1 BF 80 00', FFFD+'\x00'),
1698
('F1 BF 80 7F', FFFD+'\x7f'), ('F1 BF 80 C0', FFFDx2),
1699
('F1 BF 80 FF', FFFDx2), ('F1 BF BF 00', FFFD+'\x00'),
1700
('F1 BF BF 7F', FFFD+'\x7f'), ('F1 BF BF C0', FFFDx2),
1701
('F1 BF BF FF', FFFDx2), ('F3 00', FFFD+'\x00'),
1702
('F3 7F', FFFD+'\x7f'), ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
1703
('F3 80 00', FFFD+'\x00'), ('F3 80 7F', FFFD+'\x7f'),
1704
('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
1705
('F3 BF 00', FFFD+'\x00'), ('F3 BF 7F', FFFD+'\x7f'),
1706
('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
1707
('F3 80 80 00', FFFD+'\x00'), ('F3 80 80 7F', FFFD+'\x7f'),
1708
('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
1709
('F3 80 BF 00', FFFD+'\x00'), ('F3 80 BF 7F', FFFD+'\x7f'),
1710
('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
1711
('F3 BF 80 00', FFFD+'\x00'), ('F3 BF 80 7F', FFFD+'\x7f'),
1712
('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
1713
('F3 BF BF 00', FFFD+'\x00'), ('F3 BF BF 7F', FFFD+'\x7f'),
1714
('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
1715
('F4 00', FFFD+'\x00'), ('F4 7F', FFFD+'\x7f'), ('F4 90', FFFDx2),
1716
('F4 BF', FFFDx2), ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
1717
('F4 80 00', FFFD+'\x00'), ('F4 80 7F', FFFD+'\x7f'),
1718
('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
1719
('F4 8F 00', FFFD+'\x00'), ('F4 8F 7F', FFFD+'\x7f'),
1720
('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
1721
('F4 80 80 00', FFFD+'\x00'), ('F4 80 80 7F', FFFD+'\x7f'),
1722
('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
1723
('F4 80 BF 00', FFFD+'\x00'), ('F4 80 BF 7F', FFFD+'\x7f'),
1724
('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
1725
('F4 8F 80 00', FFFD+'\x00'), ('F4 8F 80 7F', FFFD+'\x7f'),
1726
('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
1727
('F4 8F BF 00', FFFD+'\x00'), ('F4 8F BF 7F', FFFD+'\x7f'),
1728
('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
1730
for seq, res in sequences:
1731
self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
1732
'invalid continuation byte')
1734
def test_codecs_idna(self):
1735
# Test whether trailing dot is preserved
1736
self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")
1738
def test_codecs_errors(self):
1739
# Error handling (encoding)
1740
self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii')
1741
self.assertRaises(UnicodeError, 'Andr\202 x'.encode, 'ascii','strict')
1742
self.assertEqual('Andr\202 x'.encode('ascii','ignore'), b"Andr x")
1743
self.assertEqual('Andr\202 x'.encode('ascii','replace'), b"Andr? x")
1744
self.assertEqual('Andr\202 x'.encode('ascii', 'replace'),
1745
'Andr\202 x'.encode('ascii', errors='replace'))
1746
self.assertEqual('Andr\202 x'.encode('ascii', 'ignore'),
1747
'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
1749
# Error handling (decoding)
1750
self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii')
1751
self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
1752
self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
1753
self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
1755
# Error handling (unknown character names)
1756
self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
1758
# Error handling (truncated escape sequence)
1759
self.assertRaises(UnicodeError, b"\\".decode, "unicode-escape")
1761
self.assertRaises(TypeError, b"hello".decode, "test.unicode1")
1762
self.assertRaises(TypeError, str, b"hello", "test.unicode2")
1763
self.assertRaises(TypeError, "hello".encode, "test.unicode1")
1764
self.assertRaises(TypeError, "hello".encode, "test.unicode2")
1766
# Error handling (wrong arguments)
1767
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
1769
# Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
1770
self.assertRaises(UnicodeError, float, "\ud800")
1771
self.assertRaises(UnicodeError, float, "\udf00")
1772
self.assertRaises(UnicodeError, complex, "\ud800")
1773
self.assertRaises(UnicodeError, complex, "\udf00")
1775
def test_codecs(self):
1777
self.assertEqual('hello'.encode('ascii'), b'hello')
1778
self.assertEqual('hello'.encode('utf-7'), b'hello')
1779
self.assertEqual('hello'.encode('utf-8'), b'hello')
1780
self.assertEqual('hello'.encode('utf-8'), b'hello')
1781
self.assertEqual('hello'.encode('utf-16-le'), b'h\000e\000l\000l\000o\000')
1782
self.assertEqual('hello'.encode('utf-16-be'), b'\000h\000e\000l\000l\000o')
1783
self.assertEqual('hello'.encode('latin-1'), b'hello')
1785
# Default encoding is utf-8
1786
self.assertEqual('\u2603'.encode(), b'\xe2\x98\x83')
1788
# Roundtrip safety for BMP (just the first 1024 chars)
1789
for c in range(1024):
1791
for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
1792
'utf-16-be', 'raw_unicode_escape',
1793
'unicode_escape', 'unicode_internal'):
1794
with warnings.catch_warnings():
1795
# unicode-internal has been deprecated
1796
warnings.simplefilter("ignore", DeprecationWarning)
1798
self.assertEqual(str(u.encode(encoding),encoding), u)
1800
# Roundtrip safety for BMP (just the first 256 chars)
1801
for c in range(256):
1803
for encoding in ('latin-1',):
1804
self.assertEqual(str(u.encode(encoding),encoding), u)
1806
# Roundtrip safety for BMP (just the first 128 chars)
1807
for c in range(128):
1809
for encoding in ('ascii',):
1810
self.assertEqual(str(u.encode(encoding),encoding), u)
1812
# Roundtrip safety for non-BMP (just a few chars)
1813
with warnings.catch_warnings():
1814
# unicode-internal has been deprecated
1815
warnings.simplefilter("ignore", DeprecationWarning)
1817
u = '\U00010001\U00020002\U00030003\U00040004\U00050005'
1818
for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
1819
'raw_unicode_escape',
1820
'unicode_escape', 'unicode_internal'):
1821
self.assertEqual(str(u.encode(encoding),encoding), u)
1823
# UTF-8 must be roundtrip safe for all code points
1824
# (except surrogates, which are forbidden).
1825
u = ''.join(map(chr, list(range(0, 0xd800)) +
1826
list(range(0xe000, 0x110000))))
1827
for encoding in ('utf-8',):
1828
self.assertEqual(str(u.encode(encoding),encoding), u)
1830
def test_codecs_charmap(self):
1832
s = bytes(range(128))
1834
'cp037', 'cp1026', 'cp273',
1835
'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1836
'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
1837
'cp863', 'cp865', 'cp866', 'cp1125',
1838
'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1839
'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
1840
'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
1841
'mac_cyrillic', 'mac_latin2',
1843
'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1844
'cp1256', 'cp1257', 'cp1258',
1845
'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1847
'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1848
'cp1006', 'iso8859_8',
1850
### These have undefined mappings:
1853
### These fail the round-trip:
1857
self.assertEqual(str(s, encoding).encode(encoding), s)
1860
s = bytes(range(128, 256))
1862
'cp037', 'cp1026', 'cp273',
1863
'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1864
'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
1865
'cp863', 'cp865', 'cp866', 'cp1125',
1866
'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1867
'iso8859_2', 'iso8859_4', 'iso8859_5',
1868
'iso8859_9', 'koi8_r', 'latin_1',
1869
'mac_cyrillic', 'mac_latin2',
1871
### These have undefined mappings:
1872
#'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1873
#'cp1256', 'cp1257', 'cp1258',
1874
#'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1875
#'iso8859_3', 'iso8859_6', 'iso8859_7',
1876
#'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1878
### These fail the round-trip:
1879
#'cp1006', 'cp875', 'iso8859_8',
1882
self.assertEqual(str(s, encoding).encode(encoding), s)
1884
def test_concatenation(self):
1885
self.assertEqual(("abc" "def"), "abcdef")
1886
self.assertEqual(("abc" "def"), "abcdef")
1887
self.assertEqual(("abc" "def"), "abcdef")
1888
self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
1889
self.assertEqual(("abc" "def" "ghi"), "abcdefghi")
1891
def test_printing(self):
1893
def write(self, text):
1897
print('abc', file=out)
1898
print('abc', 'def', file=out)
1899
print('abc', 'def', file=out)
1900
print('abc', 'def', file=out)
1901
print('abc\n', file=out)
1902
print('abc\n', end=' ', file=out)
1903
print('abc\n', end=' ', file=out)
1904
print('def\n', file=out)
1905
print('def\n', file=out)
1907
def test_ucs4(self):
1909
y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1910
self.assertEqual(x, y)
1913
x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1914
self.assertEqual(x, y)
1916
x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1917
self.assertEqual(x, y)
1920
br'\U11111111'.decode("raw-unicode-escape")
1921
except UnicodeDecodeError as e:
1922
self.assertEqual(e.start, 0)
1923
self.assertEqual(e.end, 10)
1925
self.fail("Should have raised UnicodeDecodeError")
1927
def test_conversion(self):
1928
# Make sure __unicode__() works properly
1967
def __new__(cls, content=""):
1968
return str.__new__(cls, 2*content)
1974
return "not unicode"
1976
self.assertEqual(str(Foo0()), "foo")
1977
self.assertEqual(str(Foo1()), "foo")
1978
self.assertEqual(str(Foo2()), "foo")
1979
self.assertEqual(str(Foo3()), "foo")
1980
self.assertEqual(str(Foo4("bar")), "foo")
1981
self.assertEqual(str(Foo5("bar")), "foo")
1982
self.assertEqual(str(Foo6("bar")), "foou")
1983
self.assertEqual(str(Foo7("bar")), "foou")
1984
self.assertEqual(str(Foo8("foo")), "foofoo")
1985
self.assertEqual(str(Foo9("foo")), "not unicode")
1987
def test_unicode_repr(self):
1996
self.assertEqual(repr(s1()), '\\n')
1997
self.assertEqual(repr(s2()), '\\n')
1999
def test_printable_repr(self):
2000
self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
2001
self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
2003
def test_expandtabs_overflows_gracefully(self):
2004
# This test only affects 32-bit platforms because expandtabs can only take
2005
# an int as the max value, not a 64-bit C long. If expandtabs is changed
2006
# to take a 64-bit long, this test should apply to all platforms.
2007
if sys.maxsize > (1 << 32) or struct.calcsize('P') != 4:
2009
self.assertRaises(OverflowError, 't\tt\t'.expandtabs, sys.maxsize)
2011
@support.cpython_only
2012
def test_expandtabs_optimization(self):
2014
self.assertIs(s.expandtabs(), s)
2016
def test_raiseMemError(self):
2017
if struct.calcsize('P') == 8:
2019
ascii_struct_size = 48
2020
compact_struct_size = 72
2023
ascii_struct_size = 24
2024
compact_struct_size = 36
2026
for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
2029
char_size = 1 # sizeof(Py_UCS1)
2030
struct_size = ascii_struct_size
2031
elif code < 0x10000:
2032
char_size = 2 # sizeof(Py_UCS2)
2033
struct_size = compact_struct_size
2035
char_size = 4 # sizeof(Py_UCS4)
2036
struct_size = compact_struct_size
2037
# Note: sys.maxsize is half of the actual max allocation because of
2038
# the signedness of Py_ssize_t. Strings of maxlen-1 should in principle
2039
# be allocatable, given enough memory.
2040
maxlen = ((sys.maxsize - struct_size) // char_size)
2041
alloc = lambda: char * maxlen
2042
self.assertRaises(MemoryError, alloc)
2043
self.assertRaises(MemoryError, alloc)
2045
def test_format_subclass(self):
2048
return '__str__ overridden'
2050
self.assertEqual("%s" % s, '__str__ overridden')
2051
self.assertEqual("{}".format(s), '__str__ overridden')
2053
# Test PyUnicode_FromFormat()
2054
def test_from_format(self):
2055
support.import_module('ctypes')
2056
from ctypes import (
2057
pythonapi, py_object, sizeof,
2058
c_int, c_long, c_longlong, c_ssize_t,
2059
c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
2060
name = "PyUnicode_FromFormat"
2061
_PyUnicode_FromFormat = getattr(pythonapi, name)
2062
_PyUnicode_FromFormat.restype = py_object
2064
def PyUnicode_FromFormat(format, *args):
2066
py_object(arg) if isinstance(arg, str) else arg
2068
return _PyUnicode_FromFormat(format, *cargs)
2070
def check_format(expected, format, *args):
2071
text = PyUnicode_FromFormat(format, *args)
2072
self.assertEqual(expected, text)
2074
# ascii format, non-ascii argument
2075
check_format('ascii\x7f=unicode\xe9',
2076
b'ascii\x7f=%U', 'unicode\xe9')
2078
# non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
2080
self.assertRaisesRegex(ValueError,
2081
'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
2082
'string, got a non-ASCII byte: 0xe9$',
2083
PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
2086
check_format('\uabcd',
2087
b'%c', c_int(0xabcd))
2088
check_format('\U0010ffff',
2089
b'%c', c_int(0x10ffff))
2090
with self.assertRaises(OverflowError):
2091
PyUnicode_FromFormat(b'%c', c_int(0x110000))
2093
check_format('\U00010000\U00100000',
2094
b'%c%c', c_int(0x10000), c_int(0x100000))
2105
check_format('%abc',
2111
check_format('abc[\ufffd',
2112
b'%.5s', 'abc[\u20ac]'.encode('utf8'))
2113
check_format("'\\u20acABC'",
2115
check_format("'\\u20",
2116
b'%.5A', '\u20acABCDEF')
2117
check_format("'\u20acABC'",
2119
check_format("'\u20acA",
2120
b'%.3R', '\u20acABCDEF')
2121
check_format('\u20acAB',
2122
b'%.3S', '\u20acABCDEF')
2123
check_format('\u20acAB',
2124
b'%.3U', '\u20acABCDEF')
2125
check_format('\u20acAB',
2126
b'%.3V', '\u20acABCDEF', None)
2127
check_format('abc[\ufffd',
2128
b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
2130
# following tests comes from #7330
2131
# test width modifier and precision modifier with %S
2132
check_format("repr= abc",
2134
check_format("repr=ab",
2135
b'repr=%.2S', 'abc')
2136
check_format("repr= ab",
2137
b'repr=%5.2S', 'abc')
2139
# test width modifier and precision modifier with %R
2140
check_format("repr= 'abc'",
2142
check_format("repr='ab",
2143
b'repr=%.3R', 'abc')
2144
check_format("repr= 'ab",
2145
b'repr=%5.3R', 'abc')
2147
# test width modifier and precision modifier with %A
2148
check_format("repr= 'abc'",
2150
check_format("repr='ab",
2151
b'repr=%.3A', 'abc')
2152
check_format("repr= 'ab",
2153
b'repr=%5.3A', 'abc')
2155
# test width modifier and precision modifier with %s
2156
check_format("repr= abc",
2157
b'repr=%5s', b'abc')
2158
check_format("repr=ab",
2159
b'repr=%.2s', b'abc')
2160
check_format("repr= ab",
2161
b'repr=%5.2s', b'abc')
2163
# test width modifier and precision modifier with %U
2164
check_format("repr= abc",
2166
check_format("repr=ab",
2167
b'repr=%.2U', 'abc')
2168
check_format("repr= ab",
2169
b'repr=%5.2U', 'abc')
2171
# test width modifier and precision modifier with %V
2172
check_format("repr= abc",
2173
b'repr=%5V', 'abc', b'123')
2174
check_format("repr=ab",
2175
b'repr=%.2V', 'abc', b'123')
2176
check_format("repr= ab",
2177
b'repr=%5.2V', 'abc', b'123')
2178
check_format("repr= 123",
2179
b'repr=%5V', None, b'123')
2180
check_format("repr=12",
2181
b'repr=%.2V', None, b'123')
2182
check_format("repr= 12",
2183
b'repr=%5.2V', None, b'123')
2185
# test integer formats (%i, %d, %u)
2188
check_format('0010',
2189
b'%0.4i', c_int(10))
2190
check_format('-123',
2192
check_format('-123',
2193
b'%li', c_long(-123))
2194
check_format('-123',
2195
b'%lli', c_longlong(-123))
2196
check_format('-123',
2197
b'%zi', c_ssize_t(-123))
2199
check_format('-123',
2201
check_format('-123',
2202
b'%ld', c_long(-123))
2203
check_format('-123',
2204
b'%lld', c_longlong(-123))
2205
check_format('-123',
2206
b'%zd', c_ssize_t(-123))
2211
b'%lu', c_ulong(123))
2213
b'%llu', c_ulonglong(123))
2215
b'%zu', c_size_t(123))
2218
min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
2219
max_longlong = -min_longlong - 1
2220
check_format(str(min_longlong),
2221
b'%lld', c_longlong(min_longlong))
2222
check_format(str(max_longlong),
2223
b'%lld', c_longlong(max_longlong))
2224
max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
2225
check_format(str(max_ulonglong),
2226
b'%llu', c_ulonglong(max_ulonglong))
2227
PyUnicode_FromFormat(b'%p', c_void_p(-1))
2229
# test padding (width and/or precision)
2230
check_format('123'.rjust(10, '0'),
2231
b'%010i', c_int(123))
2232
check_format('123'.rjust(100),
2233
b'%100i', c_int(123))
2234
check_format('123'.rjust(100, '0'),
2235
b'%.100i', c_int(123))
2236
check_format('123'.rjust(80, '0').rjust(100),
2237
b'%100.80i', c_int(123))
2239
check_format('123'.rjust(10, '0'),
2240
b'%010u', c_uint(123))
2241
check_format('123'.rjust(100),
2242
b'%100u', c_uint(123))
2243
check_format('123'.rjust(100, '0'),
2244
b'%.100u', c_uint(123))
2245
check_format('123'.rjust(80, '0').rjust(100),
2246
b'%100.80u', c_uint(123))
2248
check_format('123'.rjust(10, '0'),
2249
b'%010x', c_int(0x123))
2250
check_format('123'.rjust(100),
2251
b'%100x', c_int(0x123))
2252
check_format('123'.rjust(100, '0'),
2253
b'%.100x', c_int(0x123))
2254
check_format('123'.rjust(80, '0').rjust(100),
2255
b'%100.80x', c_int(0x123))
2258
check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
2259
b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
2262
check_format('repr=abc',
2263
b'repr=%V', 'abc', b'xyz')
2265
# Test string decode from parameter of %s using utf-8.
2266
# b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
2268
check_format('repr=\u4eba\u6c11',
2269
b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
2271
#Test replace error handler.
2272
check_format('repr=abc\ufffd',
2273
b'repr=%V', None, b'abc\xff')
2275
# not supported: copy the raw format string. these tests are just here
2276
# to check for crashs and should not be considered as specifications
2279
check_format('%1abc',
2283
check_format('%.%s',
2286
# Test PyUnicode_AsWideChar()
2287
def test_aswidechar(self):
2288
from _testcapi import unicode_aswidechar
2289
support.import_module('ctypes')
2290
from ctypes import c_wchar, sizeof
2292
wchar, size = unicode_aswidechar('abcdef', 2)
2293
self.assertEqual(size, 2)
2294
self.assertEqual(wchar, 'ab')
2296
wchar, size = unicode_aswidechar('abc', 3)
2297
self.assertEqual(size, 3)
2298
self.assertEqual(wchar, 'abc')
2300
wchar, size = unicode_aswidechar('abc', 4)
2301
self.assertEqual(size, 3)
2302
self.assertEqual(wchar, 'abc\0')
2304
wchar, size = unicode_aswidechar('abc', 10)
2305
self.assertEqual(size, 3)
2306
self.assertEqual(wchar, 'abc\0')
2308
wchar, size = unicode_aswidechar('abc\0def', 20)
2309
self.assertEqual(size, 7)
2310
self.assertEqual(wchar, 'abc\0def\0')
2312
nonbmp = chr(0x10ffff)
2313
if sizeof(c_wchar) == 2:
2316
else: # sizeof(c_wchar) == 4
2319
wchar, size = unicode_aswidechar(nonbmp, buflen)
2320
self.assertEqual(size, nchar)
2321
self.assertEqual(wchar, nonbmp + '\0')
2323
# Test PyUnicode_AsWideCharString()
2324
def test_aswidecharstring(self):
    """
    Check the _testcapi wrapper unicode_aswidecharstring().

    For each input the wrapper returns a (wchar, size) pair: size must
    be the number of wide characters of the text and wchar must be the
    text followed by a NUL character.  A non-BMP character counts as
    two wide characters when sizeof(c_wchar) == 2 and as one otherwise.
    """
    from _testcapi import unicode_aswidecharstring
    support.import_module('ctypes')
    from ctypes import c_wchar, sizeof

    # Plain text, then text with an embedded NUL character.
    for text in ('abc', 'abc\0def'):
        wchar, size = unicode_aswidecharstring(text)
        self.assertEqual(size, len(text))
        self.assertEqual(wchar, text + '\0')

    nonbmp = chr(0x10ffff)
    # 2-byte wchar_t needs two units for a non-BMP character,
    # 4-byte wchar_t needs one.
    nchar = 2 if sizeof(c_wchar) == 2 else 1
    wchar, size = unicode_aswidecharstring(nonbmp)
    self.assertEqual(size, nchar)
    self.assertEqual(wchar, nonbmp + '\0')
2346
def test_subclass_add(self):
2348
def __add__(self, o):
2350
self.assertEqual(S("4") + S("5"), "3")
2352
def __iadd__(self, o):
2356
self.assertEqual(s, "3")
2358
def test_encode_decimal(self):
2359
from _testcapi import unicode_encodedecimal
2360
self.assertEqual(unicode_encodedecimal('123'),
2362
self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
2364
self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2366
self.assertRaises(UnicodeEncodeError,
2367
unicode_encodedecimal, "123\u20ac", "strict")
2368
self.assertRaisesRegex(
2370
"^'decimal' codec can't encode character",
2371
unicode_encodedecimal, "123\u20ac", "replace")
2373
def test_transform_decimal(self):
2374
from _testcapi import unicode_transformdecimaltoascii as transform_decimal
2375
self.assertEqual(transform_decimal('123'),
2377
self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
2379
self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
2380
"\N{EM SPACE}3.14\N{EN SPACE}")
2381
self.assertEqual(transform_decimal('123\u20ac'),
2384
def test_getnewargs(self):
2386
args = text.__getnewargs__()
2387
self.assertIsNot(args[0], text)
2388
self.assertEqual(args[0], text)
2389
self.assertEqual(len(args), 1)
2391
def test_resize(self):
2392
for length in range(1, 100, 7):
2393
# generate a fresh string (refcount=1)
2394
text = 'a' * length + 'b'
2396
with support.check_warnings(('unicode_internal codec has been '
2397
'deprecated', DeprecationWarning)):
2398
# fill wstr internal field
2399
abc = text.encode('unicode_internal')
2400
self.assertEqual(abc.decode('unicode_internal'), text)
2402
# resize text: wstr field must be cleared and then recomputed
2404
abcdef = text.encode('unicode_internal')
2405
self.assertNotEqual(abc, abcdef)
2406
self.assertEqual(abcdef.decode('unicode_internal'), text)
2408
def test_compare(self):
    """
    Check comparisons between strings of different character ranges.

    Two strings are built for each of four ranges (ASCII, Latin-1, BMP
    and astral code points).  Every pair of strings, including each
    string paired with itself, is checked with == and !=; for
    identical pairs fresh copies are compared as well so that the
    comparison cannot be answered by object identity alone.  Finally
    the relative ordering of the four ranges is checked with < and >=.
    """
    # Issue #17615
    N = 10
    ascii = 'a' * N
    ascii2 = 'z' * N
    latin = '\x80' * N
    latin2 = '\xff' * N
    bmp = '\u0100' * N
    bmp2 = '\uffff' * N
    astral = '\U00100000' * N
    astral2 = '\U0010ffff' * N
    strings = (
        ascii, ascii2,
        latin, latin2,
        bmp, bmp2,
        astral, astral2)
    # combinations_with_replacement() also yields (s, s) pairs; plain
    # combinations() never does, which left the "equal" branch below
    # unreachable.
    pairs = itertools.combinations_with_replacement(strings, 2)
    for text1, text2 in pairs:
        equal = (text1 is text2)
        self.assertEqual(text1 == text2, equal)
        self.assertEqual(text1 != text2, not equal)

        if equal:
            self.assertTrue(text1 <= text2)
            self.assertTrue(text1 >= text2)

            # text1 is text2: duplicate strings to skip the "str1 == str2"
            # optimization in unicode_compare_eq() and really compare
            # character per character
            copy1 = duplicate_string(text1)
            copy2 = duplicate_string(text2)
            self.assertIsNot(copy1, copy2)

            self.assertTrue(copy1 == copy2)
            self.assertFalse(copy1 != copy2)

            self.assertTrue(copy1 <= copy2)
            # Compare the two distinct copies (this used to compare
            # copy2 with itself).
            self.assertTrue(copy1 >= copy2)

    self.assertTrue(ascii < ascii2)
    self.assertTrue(ascii < latin)
    self.assertTrue(ascii < bmp)
    self.assertTrue(ascii < astral)
    self.assertFalse(ascii >= ascii2)
    self.assertFalse(ascii >= latin)
    self.assertFalse(ascii >= bmp)
    self.assertFalse(ascii >= astral)

    self.assertFalse(latin < ascii)
    self.assertTrue(latin < latin2)
    self.assertTrue(latin < bmp)
    self.assertTrue(latin < astral)
    self.assertTrue(latin >= ascii)
    self.assertFalse(latin >= latin2)
    self.assertFalse(latin >= bmp)
    self.assertFalse(latin >= astral)

    self.assertFalse(bmp < ascii)
    self.assertFalse(bmp < latin)
    self.assertTrue(bmp < bmp2)
    self.assertTrue(bmp < astral)
    self.assertTrue(bmp >= ascii)
    self.assertTrue(bmp >= latin)
    self.assertFalse(bmp >= bmp2)
    self.assertFalse(bmp >= astral)

    self.assertFalse(astral < ascii)
    self.assertFalse(astral < latin)
    self.assertFalse(astral < bmp2)
    self.assertTrue(astral < astral2)
    self.assertTrue(astral >= ascii)
    self.assertTrue(astral >= latin)
    self.assertTrue(astral >= bmp2)
    self.assertFalse(astral >= astral2)
2483
class StringModuleTest(unittest.TestCase):
    """Tests for the parsing helpers exposed by the ``_string`` module."""

    def test_formatter_parser(self):
        """Check the tuples produced by _string.formatter_parser()."""
        def parse(format):
            return list(_string.formatter_parser(format))

        # (format string, expected list of 4-tuples)
        cases = (
            ("prefix {2!s}xxx{0:^+10.3f}{obj.attr!s} {z[0]!s:10}", [
                ('prefix ', '2', '', 's'),
                ('xxx', '0', '^+10.3f', None),
                ('', 'obj.attr', '', 's'),
                (' ', 'z[0]', '10', 's'),
            ]),
            ("prefix {} suffix", [
                ('prefix ', '', '', None),
                (' suffix', None, None, None),
            ]),
            ("str", [
                ('str', None, None, None),
            ]),
            ("", []),
            ("{0}", [
                ('', '0', '', None),
            ]),
        )
        for format_string, expected in cases:
            formatter = parse(format_string)
            self.assertEqual(formatter, expected)

        # A non-string argument is rejected.
        self.assertRaises(TypeError, _string.formatter_parser, 1)

    def test_formatter_field_name_split(self):
        """Check the output of _string.formatter_field_name_split()."""
        def split(name):
            # Materialise the second item so it can be compared to a list.
            first, rest = _string.formatter_field_name_split(name)
            return [first, list(rest)]

        # (field name, expected [first, [(is_attribute, name), ...]])
        cases = (
            ("obj", ["obj", []]),
            ("obj.arg", ["obj", [(True, 'arg')]]),
            ("obj[key]", ["obj", [(False, 'key')]]),
            ("obj.arg[key1][key2]", [
                "obj",
                [(True, 'arg'),
                 (False, 'key1'),
                 (False, 'key2'),
                ]]),
        )
        for field_name, expected in cases:
            self.assertEqual(split(field_name), expected)

        # A non-string argument is rejected.
        self.assertRaises(TypeError, _string.formatter_field_name_split, 1)
2534
if __name__ == "__main__":