~ubuntu-branches/ubuntu/karmic/beautifulsoup/karmic

« back to all changes in this revision

Viewing changes to BeautifulSoupTests.py

Committer: Bazaar Package Importer
Author(s): Decklin Foster
Date: 2008-06-30 19:04:07 UTC
mfrom: (1.1.6 upstream) (2.1.2 lenny)
Revision ID: james.westby@ubuntu.com-20080630190407-k08u6262n6ilm8ze

Tags: 3.0.7-1

http://bugs.debian.org/483579

http://bugs.debian.org/479414

New upstream version, fixes UnicodeDecodeError (Closes: #483579, #479414)

files modified:
BeautifulSoup.py

BeautifulSoupTests.py

PKG-INFO

debian/changelog

Show diffs side-by-side

added

removed

BeautifulSoupTests.py

248

copied = deepcopy(self.soup)

249

self.assertEqual(str(copied), str(self.soup))

250

251

def testUnicodePickle(self):

252

import cPickle as pickle

253

html = "<b>" + chr(0xc3) + "</b>"

254

soup = BeautifulSoup(html)

255

dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)

256

loaded = pickle.loads(dumped)

257

self.assertEqual(str(loaded), str(soup))

258

259

251

260

class WriteOnlyCode(SoupTest):

252

261

"Testing the modification of the tree."

253

262

357

366

# A very simple case

358

367

text = '<html><div id="nav">Nav crap</div>Real content here.</html>'

359

368

soup = BeautifulSoup(text)

360

soup.find("div", id="nav").extract()

369

extracted = soup.find("div", id="nav").extract()

361

370

self.assertEqual(str(soup), "<html>Real content here.</html>")

371

self.assertEqual(str(extracted), '<div id="nav">Nav crap</div>')

362

372

363

373

# A simple case, a more complex test.

364

374

text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"

554

564

self.assertEquals(soup.find(text=r).string, text)

555

565

self.assertEquals(soup.find(text=text).__class__, Declaration)

556

566

567

namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'

568

'<html>foo</html>')

569

soup = BeautifulSoup(namespaced_doctype)

570

self.assertEquals(soup.contents[0],

571

'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')

572

self.assertEquals(soup.html.contents[0], 'foo')

573

557

574

def testEntityConversions(self):

558

575

text = "<<sacré bleu!>>"

559

576

soup = BeautifulStoneSoup(text)

648

665

soup = BeautifulStoneSoup(validURL)

649

666

self.assertEquals(str(soup), validURL)

650

667

668

651

669

class EncodeRed(SoupTest):

652

670

"""Tests encoding conversion, Unicode conversion, and Microsoft

653

671

smart quote fixes."""

664

682

665

683

def testGarbageInGarbageOut(self):

666

684

ascii = "<foo>a</foo>"

685

asciiSoup = BeautifulStoneSoup(ascii)

686

self.assertEquals(ascii, str(asciiSoup))

687

667

688

unicodeData = u"<foo>\u00FC</foo>"

668

689

utf8 = unicodeData.encode("utf-8")

669

670

asciiSoup = BeautifulStoneSoup(ascii)

671

self.assertEquals(ascii, str(asciiSoup))

672

673

utf8Soup = BeautifulStoneSoup(utf8)

690

self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')

691

692

unicodeSoup = BeautifulStoneSoup(unicodeData)

693

self.assertEquals(unicodeData, unicode(unicodeSoup))

694

self.assertEquals(unicode(unicodeSoup.foo.string), u'\u00FC')

695

696

utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')

674

697

self.assertEquals(utf8, str(utf8Soup))

675

698

self.assertEquals(utf8Soup.originalEncoding, "utf-8")

676

699

678

701

self.assertEquals(utf8, str(utf8Soup))

679

702

self.assertEquals(utf8Soup.originalEncoding, None)

680

703

681

unicodeSoup = BeautifulStoneSoup(unicodeData)

682

self.assertEquals(unicodeData, unicode(unicodeSoup))

683

704

684

705

def testHandleInvalidCodec(self):

685

706

for bad_encoding in ['.utf8', '...', 'utF---16.!']:

696

717

utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"

697

718

soup = BeautifulStoneSoup(euc_jp)

698

719

if soup.originalEncoding != "euc-jp":

699

raise "Test failed when parsing euc-jp document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it."

720

raise Exception("Test failed when parsing euc-jp document. "

721

"If you're running Python >=2.4, or you have "

722

"cjkcodecs installed, this is a real problem. "

723

"Otherwise, ignore it.")

700

724

701

725

self.assertEquals(soup.originalEncoding, "euc-jp")

702

726

self.assertEquals(str(soup), utf8)

708

732

def testRewrittenMetaTag(self):

709

733

no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''

710

734

soup = BeautifulSoup(no_shift_jis_html)

711

self.assertEquals(soup.originalEncoding, "windows-1252")

712

735

713

736

# Beautiful Soup used to try to rewrite the meta tag even if the

714

737

# meta tag got filtered out by the strainer. This test makes

717

740

soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)

718

741

self.assertEquals(soup.contents[0].name, 'pre')

719

742

720

shift_jis_html = '''<html><head>\n<meta http-equiv="Content-type" content="text/html; charset=x-sjis" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''

743

meta_tag = ('<meta content="text/html; charset=x-sjis" '

744

'http-equiv="Content-type" />')

745

shift_jis_html = (

746

'<html><head>\n%s\n'

747

'<meta http-equiv="Content-language" content="ja" />'

748

'</head><body><pre>\n'

749

'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'

750

'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'

751

'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'

752

'</pre></body></html>') % meta_tag

721

753

soup = BeautifulSoup(shift_jis_html)

722

754

if soup.originalEncoding != "shift-jis":

723

raise "Test failed when parsing shift-jis document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it."

755

raise Exception("Test failed when parsing shift-jis document "

756

"with meta tag '%s'."

757

"If you're running Python >=2.4, or you have "

758

"cjkcodecs installed, this is a real problem. "

759

"Otherwise, ignore it." % meta_tag)

724

760

self.assertEquals(soup.originalEncoding, "shift-jis")

725

self.assertEquals(str(soup), '<html><head>\n<meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</pre></body></html>')

761

762

content_type_tag = soup.meta['content']

763

self.assertEquals(content_type_tag[content_type_tag.find('charset='):],

764

'charset=%SOUP-ENCODING%')

765

content_type = str(soup.meta)

766

index = content_type.find('charset=')

767

self.assertEqual(content_type[index:index+len('charset=utf8')+1],

768

'charset=utf-8')

769

content_type = soup.meta.__str__('shift-jis')

770

index = content_type.find('charset=')

771

self.assertEqual(content_type[index:index+len('charset=shift-jis')],

772

'charset=shift-jis')

773

774

self.assertEquals(str(soup), (

775

'<html><head>\n'

776

'<meta content="text/html; charset=utf-8" '

777

'http-equiv="Content-type" />\n'

778

'<meta http-equiv="Content-language" content="ja" />'

779

'</head><body><pre>\n'

780

'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'

781

'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'

782

'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'

783

'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'

784

'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'

785

'</pre></body></html>'))

726

786

self.assertEquals(soup.renderContents("shift-jis"),

727

787

shift_jis_html.replace('x-sjis', 'shift-jis'))

728

788

731

791

self.assertSoupEquals(soup.__str__("utf-8"),

732

792

isolatin.replace("ISO-Latin-1", "utf-8").replace("\xe9", "\xc3\xa9"))

733

793

734

735

794

def testHebrew(self):

736

795

iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'

737

796

utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'

755

814

utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"

756

815

self.assertSoupEquals(utf_8)

757

816

817

818

class Whitewash(SoupTest):

819

"""Test whitespace preservation."""

820

821

def testPreservedWhitespace(self):

822

self.assertSoupEquals("<pre> </pre>")

823

self.assertSoupEquals("<pre> woo </pre>")

824

825

def testCollapsedWhitespace(self):

826

self.assertSoupEquals("<p>   </p>", "<p> </p>")

827

828

758

829

if __name__ == '__main__':

759

830

unittest.main()

Older »