945
948
# * strict decoding testing for all of the
946
949
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
951
def test_utf8_decode_valid_sequences(self):
954
(b'\x00', '\x00'), (b'a', 'a'), (b'\x7f', '\x7f'),
956
(b'\xc2\x80', '\x80'), (b'\xdf\xbf', '\u07ff'),
958
(b'\xe0\xa0\x80', '\u0800'), (b'\xed\x9f\xbf', '\ud7ff'),
959
(b'\xee\x80\x80', '\uE000'), (b'\xef\xbf\xbf', '\uffff'),
961
(b'\xF0\x90\x80\x80', '\U00010000'),
962
(b'\xf4\x8f\xbf\xbf', '\U0010FFFF')
964
for seq, res in sequences:
965
self.assertEqual(seq.decode('utf-8'), res)
968
def test_utf8_decode_invalid_sequences(self):
969
# continuation bytes in a sequence of 2, 3, or 4 bytes
970
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
971
# start bytes of a 2-byte sequence equivalent to codepoints < 0x7F
972
invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
973
# start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
974
invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
975
invalid_start_bytes = (
976
continuation_bytes + invalid_2B_seq_start_bytes +
977
invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
980
for byte in invalid_start_bytes:
981
self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
983
for sb in invalid_2B_seq_start_bytes:
984
for cb in continuation_bytes:
985
self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
987
for sb in invalid_4B_seq_start_bytes:
988
for cb1 in continuation_bytes[:3]:
989
for cb3 in continuation_bytes[:3]:
990
self.assertRaises(UnicodeDecodeError,
991
(sb+cb1+b'\x80'+cb3).decode, 'utf-8')
993
for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
994
self.assertRaises(UnicodeDecodeError,
995
(b'\xE0'+cb+b'\x80').decode, 'utf-8')
996
self.assertRaises(UnicodeDecodeError,
997
(b'\xE0'+cb+b'\xBF').decode, 'utf-8')
999
for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
1000
self.assertRaises(UnicodeDecodeError,
1001
(b'\xED'+cb+b'\x80').decode, 'utf-8')
1002
self.assertRaises(UnicodeDecodeError,
1003
(b'\xED'+cb+b'\xBF').decode, 'utf-8')
1004
for cb in [bytes([x]) for x in range(0x80, 0x90)]:
1005
self.assertRaises(UnicodeDecodeError,
1006
(b'\xF0'+cb+b'\x80\x80').decode, 'utf-8')
1007
self.assertRaises(UnicodeDecodeError,
1008
(b'\xF0'+cb+b'\xBF\xBF').decode, 'utf-8')
1009
for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
1010
self.assertRaises(UnicodeDecodeError,
1011
(b'\xF4'+cb+b'\x80\x80').decode, 'utf-8')
1012
self.assertRaises(UnicodeDecodeError,
1013
(b'\xF4'+cb+b'\xBF\xBF').decode, 'utf-8')
1015
def test_issue8271(self):
1016
# Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
1017
# only the start byte and the continuation byte(s) are now considered
1018
# invalid, instead of the number of bytes specified by the start byte.
1019
# See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
1020
# table 3-8, Row 2) for more information about the algorithm used.
1023
# invalid start bytes
1024
(b'\x80', FFFD), # continuation byte
1025
(b'\x80\x80', FFFD*2), # 2 continuation bytes
1027
(b'\xc0\xc0', FFFD*2),
1029
(b'\xc1\xc0', FFFD*2),
1030
(b'\xc0\xc1', FFFD*2),
1031
# with start byte of a 2-byte sequence
1032
(b'\xc2', FFFD), # only the start byte
1033
(b'\xc2\xc2', FFFD*2), # 2 start bytes
1034
(b'\xc2\xc2\xc2', FFFD*3), # 2 start bytes
1035
(b'\xc2\x41', FFFD+'A'), # invalid continuation byte
1036
# with start byte of a 3-byte sequence
1037
(b'\xe1', FFFD), # only the start byte
1038
(b'\xe1\xe1', FFFD*2), # 2 start bytes
1039
(b'\xe1\xe1\xe1', FFFD*3), # 3 start bytes
1040
(b'\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
1041
(b'\xe1\x80', FFFD), # only 1 continuation byte
1042
(b'\xe1\x41', FFFD+'A'), # invalid continuation byte
1043
(b'\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
1044
(b'\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
1045
(b'\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
1046
(b'\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
1047
(b'\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
1048
# with start byte of a 4-byte sequence
1049
(b'\xf1', FFFD), # only the start byte
1050
(b'\xf1\xf1', FFFD*2), # 2 start bytes
1051
(b'\xf1\xf1\xf1', FFFD*3), # 3 start bytes
1052
(b'\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
1053
(b'\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
1054
(b'\xf1\x80', FFFD), # only 1 continuation bytes
1055
(b'\xf1\x80\x80', FFFD), # only 2 continuation bytes
1056
(b'\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
1057
(b'\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 1 invalid
1058
(b'\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
1059
(b'\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cv and 1 valid
1060
(b'\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 invalid
1061
(b'\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 invalid
1062
(b'\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 1 valid cb and 1 invalid
1063
(b'\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
1064
(b'\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
1065
(b'\xf1\xf1\x80\x41', FFFD*2+'A'),
1066
(b'\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
1067
# with invalid start byte of a 4-byte sequence (rfc2279)
1068
(b'\xf5', FFFD), # only the start byte
1069
(b'\xf5\xf5', FFFD*2), # 2 start bytes
1070
(b'\xf5\x80', FFFD*2), # only 1 continuation byte
1071
(b'\xf5\x80\x80', FFFD*3), # only 2 continuation byte
1072
(b'\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
1073
(b'\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
1074
(b'\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
1075
(b'\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
1076
# with invalid start byte of a 5-byte sequence (rfc2279)
1077
(b'\xf8', FFFD), # only the start byte
1078
(b'\xf8\xf8', FFFD*2), # 2 start bytes
1079
(b'\xf8\x80', FFFD*2), # only one continuation byte
1080
(b'\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
1081
(b'\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
1082
# with invalid start byte of a 6-byte sequence (rfc2279)
1083
(b'\xfc', FFFD), # only the start byte
1084
(b'\xfc\xfc', FFFD*2), # 2 start bytes
1085
(b'\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
1086
(b'\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
1087
# invalid start byte
1089
(b'\xfe\x80\x80', FFFD*3),
1091
(b'\xf1\x80\x41\x42\x43', '\ufffd\x41\x42\x43'),
1092
(b'\xf1\x80\xff\x42\x43', '\ufffd\ufffd\x42\x43'),
1093
(b'\xf1\x80\xc2\x81\x43', '\ufffd\x81\x43'),
1094
(b'\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
1095
'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
1097
for n, (seq, res) in enumerate(sequences):
1098
self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
1099
self.assertEqual(seq.decode('utf-8', 'replace'), res)
1100
self.assertEqual((seq+b'b').decode('utf-8', 'replace'), res+'b')
1101
self.assertEqual(seq.decode('utf-8', 'ignore'),
1102
res.replace('\uFFFD', ''))
948
1104
def test_codecs_idna(self):
949
1105
# Test whether trailing dot is preserved
950
1106
self.assertEqual("www.python.org.".encode("idna"), b"www.python.org.")