~ubuntu-branches/ubuntu/natty/python2.6/natty-security

« back to all changes in this revision

Viewing changes to Lib/test/test_unicode.py

Committer: Bazaar Package Importer
Author(s): Matthias Klose
Date: 2010-07-06 14:09:51 UTC
mfrom: (1.2.1 upstream) (10.1.24 sid)
Revision ID: james.westby@ubuntu.com-20100706140951-13i9odydj3ku1480

Tags: 2.6.5+20100706-1ubuntu1

* Merge with Debian; remaining changes:
- Regenerate the control file.

files removed:
debian/PVER-dbg.symbols.i386.in

debian/PVER-dbg.symbols.lpia.in

debian/libPVER.symbols.i386.in

debian/libPVER.symbols.lpia.in

files modified:
Doc/library/2to3.rst

Doc/library/math.rst

Doc/library/string.rst

Doc/library/stringio.rst

Doc/library/struct.rst

Doc/library/sys.rst

Doc/library/urllib.rst

Lib/plat-linux2/DLFCN.py

Lib/plat-linux2/IN.py

Lib/plat-linux2/TYPES.py

Lib/plat-linux2/regen

Lib/smtpd.py

Lib/test/test_audioop.py

Lib/test/test_codeccallbacks.py

Lib/test/test_parser.py

Lib/test/test_unicode.py

Misc/NEWS

Modules/audioop.c

Modules/parsermodule.c

Objects/floatobject.c

Objects/unicodeobject.c

Tools/unicode/mkstringprep.py

debian/PVER-minimal.README.Debian.in

debian/changelog

debian/control

debian/control.in

debian/libpython.symbols.in

debian/pyhtml2devhelp.py

debian/rules

Show diffs side-by-side

added added

removed removed

Lib/test/test_unicode.py

587

# * strict decoding testing for all of the

588

# UTF8_ERROR cases in PyUnicode_DecodeUTF8

589

590

def test_utf8_decode_valid_sequences(self):
    # Well-formed UTF-8 must decode to the expected code points.  The
    # table holds the shortest and longest encodings of each length,
    # including the code points on either side of the surrogate range
    # (U+D7FF and U+E000).
    sequences = [
        # single byte
        ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
        # 2 bytes
        ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
        # 3 bytes
        ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
        ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
        # 4 bytes
        ('\xF0\x90\x80\x80', u'\U00010000'),
        ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
    ]
    for seq, res in sequences:
        self.assertEqual(seq.decode('utf-8'), res)

    # Round-trip every code point this build can represent.
    # sys.maxunicode is itself a valid code point, so the upper bound
    # must be inclusive; xrange avoids materializing a list with one
    # entry per code point.
    for code in xrange(sys.maxunicode + 1):
        ch = unichr(code)
        self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))

608

609

def test_utf8_decode_invalid_sequences(self):
    # Strict UTF-8 decoding must raise UnicodeDecodeError for bytes that
    # cannot start a sequence and for sequences whose second byte is
    # outside the range allowed for the given start byte.

    # continuation bytes in a sequence of 2, 3, or 4 bytes
    continuation_bytes = map(chr, range(0x80, 0xC0))
    # start bytes of a 2-byte sequence equivalent to codepoints <= 0x7F
    invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
    # start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
    invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
    # The remaining bytes 0xF8-0xFF complete the set; starting at 0xF8
    # keeps 0xF7 from being listed (and tested) twice.
    invalid_start_bytes = (
        continuation_bytes + invalid_2B_seq_start_bytes +
        invalid_4B_seq_start_bytes + map(chr, range(0xF8, 0x100))
    )

    # a lone invalid start byte
    for byte in invalid_start_bytes:
        self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')

    # an invalid 2-byte start byte followed by any continuation byte
    for sb in invalid_2B_seq_start_bytes:
        for cb in continuation_bytes:
            self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')

    # an invalid 4-byte start byte followed by three continuation bytes
    # (only the first three continuation values are sampled)
    for sb in invalid_4B_seq_start_bytes:
        for cb1 in continuation_bytes[:3]:
            for cb3 in continuation_bytes[:3]:
                self.assertRaises(UnicodeDecodeError,
                                  (sb+cb1+'\x80'+cb3).decode, 'utf-8')

    # E0 followed by 80..9F
    for cb in map(chr, range(0x80, 0xA0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xE0'+cb+'\xBF').decode, 'utf-8')
    # XXX: surrogates shouldn't be valid UTF-8!
    # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
    # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
    #for cb in map(chr, range(0xA0, 0xC0)):
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\x80').decode, 'utf-8')
        #self.assertRaises(UnicodeDecodeError,
                          #('\xED'+cb+'\xBF').decode, 'utf-8')
    # F0 followed by 80..8F
    for cb in map(chr, range(0x80, 0x90)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
    # F4 followed by 90..BF
    for cb in map(chr, range(0x90, 0xC0)):
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
        self.assertRaises(UnicodeDecodeError,
                          ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')

658

659

def test_issue8271(self):
    # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
    # only the start byte and the continuation byte(s) are now considered
    # invalid, instead of the number of bytes specified by the start byte.
    # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
    # table 3-8, Row 2) for more information about the algorithm used.
    #
    # Each entry pairs an invalid byte string with the text expected from
    # decoding it with the 'replace' error handler ("cb" in the comments
    # below stands for "continuation byte").
    FFFD = u'\ufffd'
    sequences = [
        # invalid start bytes
        ('\x80', FFFD),  # continuation byte
        ('\x80\x80', FFFD*2),  # 2 continuation bytes
        ('\xc0', FFFD),
        ('\xc0\xc0', FFFD*2),
        ('\xc1', FFFD),
        ('\xc1\xc0', FFFD*2),
        ('\xc0\xc1', FFFD*2),
        # with start byte of a 2-byte sequence
        ('\xc2', FFFD),  # only the start byte
        ('\xc2\xc2', FFFD*2),  # 2 start bytes
        ('\xc2\xc2\xc2', FFFD*3),  # 3 start bytes
        ('\xc2\x41', FFFD+'A'),  # invalid continuation byte
        # with start byte of a 3-byte sequence
        ('\xe1', FFFD),  # only the start byte
        ('\xe1\xe1', FFFD*2),  # 2 start bytes
        ('\xe1\xe1\xe1', FFFD*3),  # 3 start bytes
        ('\xe1\xe1\xe1\xe1', FFFD*4),  # 4 start bytes
        ('\xe1\x80', FFFD),  # only 1 continuation byte
        ('\xe1\x41', FFFD+'A'),  # invalid continuation byte
        ('\xe1\x41\x80', FFFD+'A'+FFFD),  # invalid cb followed by valid cb
        ('\xe1\x41\x41', FFFD+'AA'),  # 2 invalid continuation bytes
        ('\xe1\x80\x41', FFFD+'A'),  # only 1 valid continuation byte
        ('\xe1\x80\xe1\x41', FFFD*2+'A'),  # 1 valid and the other invalid
        ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD),  # 1 invalid and the other valid
        # with start byte of a 4-byte sequence
        ('\xf1', FFFD),  # only the start byte
        ('\xf1\xf1', FFFD*2),  # 2 start bytes
        ('\xf1\xf1\xf1', FFFD*3),  # 3 start bytes
        ('\xf1\xf1\xf1\xf1', FFFD*4),  # 4 start bytes
        ('\xf1\xf1\xf1\xf1\xf1', FFFD*5),  # 5 start bytes
        ('\xf1\x80', FFFD),  # only 1 continuation byte
        ('\xf1\x80\x80', FFFD),  # only 2 continuation bytes
        ('\xf1\x80\x41', FFFD+'A'),  # 1 valid cb and 1 invalid
        ('\xf1\x80\x41\x41', FFFD+'AA'),  # 1 valid cb and 2 invalid
        ('\xf1\x80\x80\x41', FFFD+'A'),  # 2 valid cb and 1 invalid
        ('\xf1\x41\x80', FFFD+'A'+FFFD),  # 1 invalid cb and 1 valid
        ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2),  # 1 invalid cb and 2 valid
        ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'),  # invalid, valid, invalid cb
        ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD),  # 2 invalid cb and 1 valid
        ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
        ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
        ('\xf1\xf1\x80\x41', FFFD*2+'A'),
        ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
        # with invalid start byte of a 4-byte sequence (rfc2279)
        ('\xf5', FFFD),  # only the start byte
        ('\xf5\xf5', FFFD*2),  # 2 start bytes
        ('\xf5\x80', FFFD*2),  # only 1 continuation byte
        ('\xf5\x80\x80', FFFD*3),  # only 2 continuation bytes
        ('\xf5\x80\x80\x80', FFFD*4),  # 3 continuation bytes
        ('\xf5\x80\x41', FFFD*2+'A'),  # 1 valid cb and 1 invalid
        ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
        ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
        # with invalid start byte of a 5-byte sequence (rfc2279)
        ('\xf8', FFFD),  # only the start byte
        ('\xf8\xf8', FFFD*2),  # 2 start bytes
        ('\xf8\x80', FFFD*2),  # only one continuation byte
        ('\xf8\x80\x41', FFFD*2 + 'A'),  # 1 valid cb and 1 invalid
        ('\xf8\x80\x80\x80\x80', FFFD*5),  # invalid 5 bytes seq with 5 bytes
        # with invalid start byte of a 6-byte sequence (rfc2279)
        ('\xfc', FFFD),  # only the start byte
        ('\xfc\xfc', FFFD*2),  # 2 start bytes
        ('\xfc\x80\x80', FFFD*3),  # only 2 continuation bytes
        ('\xfc\x80\x80\x80\x80\x80', FFFD*6),  # start byte + 5 continuation bytes
        # invalid start byte
        ('\xfe', FFFD),
        ('\xfe\x80\x80', FFFD*3),
        # other sequences
        ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
        ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
        ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
        ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
         u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
    ]
    for seq, res in sequences:
        # strict decoding rejects every entry outright
        self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
        self.assertEqual(seq.decode('utf-8', 'replace'), res)
        # a trailing valid byte must survive after the invalid sequence
        self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
        # 'ignore' yields the same text minus the replacement characters
        self.assertEqual(seq.decode('utf-8', 'ignore'),
                         res.replace(u'\uFFFD', ''))

747

590

748

def test_codecs_idna(self):

591

749

# Test whether trailing dot is preserved

592

750

self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")

Older »