587
587
# * strict decoding testing for all of the
588
588
# UTF8_ERROR cases in PyUnicode_DecodeUTF8
590
def test_utf8_decode_valid_sequences(self):
    # Well-formed UTF-8 byte strings must decode to the expected text.
    # The table pairs an encoded byte string with the unicode string it
    # must decode to, grouped by the length of the encoded sequence.
    sequences = [
        # single byte
        ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
        # 2 bytes
        ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
        # 3 bytes
        ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
        ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
        # 4 bytes
        ('\xF0\x90\x80\x80', u'\U00010000'),
        ('\xf4\x8f\xbf\xbf', u'\U0010FFFF'),
    ]
    for seq, res in sequences:
        self.assertEqual(seq.decode('utf-8'), res)

    # Every code point must survive an encode/decode round trip.  The
    # bound is sys.maxunicode + 1 so the last code point is exercised as
    # well, and xrange avoids materialising the whole range as a list.
    for code in xrange(sys.maxunicode + 1):
        ch = unichr(code)
        self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
609
def test_utf8_decode_invalid_sequences(self):
    # Strict UTF-8 decoding must raise UnicodeDecodeError for each of the
    # ill-formed byte sequences built below.

    # continuation bytes in a sequence of 2, 3, or 4 bytes
    continuation_bytes = [chr(b) for b in range(0x80, 0xC0)]
    # start bytes of a 2-byte sequence equivalent to code points <= 0x7F
    # (i.e. overlong encodings of ASCII)
    invalid_2B_seq_start_bytes = [chr(b) for b in range(0xC0, 0xC2)]
    # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
    invalid_4B_seq_start_bytes = [chr(b) for b in range(0xF5, 0xF8)]
    # The remaining bytes start at 0xF8: 0xF7 is already part of
    # invalid_4B_seq_start_bytes, so it is not listed a second time.
    invalid_start_bytes = (
        continuation_bytes + invalid_2B_seq_start_bytes +
        invalid_4B_seq_start_bytes + [chr(b) for b in range(0xF8, 0x100)]
    )

    # a lone invalid start byte
    for byte in invalid_start_bytes:
        self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')

    # an invalid 2-byte start byte followed by any continuation byte
    for sb in invalid_2B_seq_start_bytes:
        for cb in continuation_bytes:
            self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')

    # an invalid 4-byte start byte followed by three continuation bytes
    # (only the first three continuation bytes are sampled)
    for sb in invalid_4B_seq_start_bytes:
        for cb1 in continuation_bytes[:3]:
            for cb3 in continuation_bytes[:3]:
                self.assertRaises(UnicodeDecodeError,
                                  (sb+cb1+'\x80'+cb3).decode, 'utf-8')

    # '\xE0' followed by 0x80..0x9F
    for cb in [chr(b) for b in range(0x80, 0xA0)]:
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\xBF').decode, 'utf-8')
    # XXX: surrogates shouldn't be valid UTF-8!
    # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    #for cb in map(chr, range(0xA0, 0xC0)):
        #sys.__stdout__.write('\\xED\\x%02x\\x80\n' % ord(cb))
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\x80').decode, 'utf-8')
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\xBF').decode, 'utf-8')
    # '\xF0' followed by 0x80..0x8F
    for cb in [chr(b) for b in range(0x80, 0x90)]:
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
    # '\xF4' followed by 0x90..0xBF
    for cb in [chr(b) for b in range(0x90, 0xC0)]:
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
659
def test_issue8271(self):
    # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
    # only the start byte and the continuation byte(s) are now considered
    # invalid, instead of the number of bytes specified by the start byte.
    # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
    # table 3-8, Row 2) for more information about the algorithm used.
    #
    # Each entry pairs an ill-formed byte string with the text expected
    # from decoding it with the 'replace' error handler.
    FFFD = u'\ufffd'
    sequences = [
        # invalid start bytes
        ('\x80', FFFD), # continuation byte
        ('\x80\x80', FFFD*2), # 2 continuation bytes
        ('\xc0\xc0', FFFD*2),
        ('\xc1\xc0', FFFD*2),
        ('\xc0\xc1', FFFD*2),
        # with start byte of a 2-byte sequence
        ('\xc2', FFFD), # only the start byte
        ('\xc2\xc2', FFFD*2), # 2 start bytes
        ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
        ('\xc2\x41', FFFD+'A'), # invalid continuation byte
        # with start byte of a 3-byte sequence
        ('\xe1', FFFD), # only the start byte
        ('\xe1\xe1', FFFD*2), # 2 start bytes
        ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
        ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
        ('\xe1\x80', FFFD), # only 1 continuation byte
        ('\xe1\x41', FFFD+'A'), # invalid continuation byte
        ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
        ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
        ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
        ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
        ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
        # with start byte of a 4-byte sequence
        ('\xf1', FFFD), # only the start byte
        ('\xf1\xf1', FFFD*2), # 2 start bytes
        ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
        ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
        ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
        ('\xf1\x80', FFFD), # only 1 continuation byte
        ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
        ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
        ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
        ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
        ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
        ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
        ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
        ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
        ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
        ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
        ('\xf1\xf1\x80\x41', FFFD*2+'A'),
        ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
        # with invalid start byte of a 4-byte sequence (rfc2279)
        ('\xf5', FFFD), # only the start byte
        ('\xf5\xf5', FFFD*2), # 2 start bytes
        ('\xf5\x80', FFFD*2), # only 1 continuation byte
        ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
        ('\xf5\x80\x41', FFFD*2+'A'), # 1 valid cb and 1 invalid
        ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
        ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
        # with invalid start byte of a 5-byte sequence (rfc2279)
        ('\xf8', FFFD), # only the start byte
        ('\xf8\xf8', FFFD*2), # 2 start bytes
        ('\xf8\x80', FFFD*2), # only one continuation byte
        ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
        ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
        # with invalid start byte of a 6-byte sequence (rfc2279)
        ('\xfc', FFFD), # only the start byte
        ('\xfc\xfc', FFFD*2), # 2 start bytes
        ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
        ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # invalid 6 bytes seq with 6 bytes
        # invalid start byte 0xFE
        ('\xfe\x80\x80', FFFD*3),
        # other sequences
        ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
        ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
        ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
        ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
         u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
    ]
    for seq, res in sequences:
        # strict decoding must reject the sequence outright
        self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
        # 'replace' substitutes U+FFFD as listed in the table
        self.assertEqual(seq.decode('utf-8', 'replace'), res)
        # a valid byte appended after the sequence must still be decoded
        self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
        # 'ignore' yields the same text with the replacements dropped
        self.assertEqual(seq.decode('utf-8', 'ignore'),
                         res.replace(u'\uFFFD', ''))
590
748
def test_codecs_idna(self):
591
749
# Test whether trailing dot is preserved
592
750
self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")