248
248
copied = deepcopy(self.soup)
249
249
self.assertEqual(str(copied), str(self.soup))
251
def testUnicodePickle(self):
    """Pickling a soup containing non-ASCII data must round-trip:
    the unpickled soup renders to the same string as the original.
    """
    # Python 2 C-accelerated pickle; this file predates Python 3.
    import cPickle as pickle
    # chr(0xc3) injects a non-ASCII byte to exercise unicode handling
    # through the pickle round-trip.
    html = "<b>" + chr(0xc3) + "</b>"
    soup = BeautifulSoup(html)
    # HIGHEST_PROTOCOL is the binary protocol, the stricter case for
    # serializing the parse tree.
    dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
    loaded = pickle.loads(dumped)
    self.assertEqual(str(loaded), str(soup))
251
260
class WriteOnlyCode(SoupTest):
252
261
"Testing the modification of the tree."
357
366
# A very simple case
358
367
text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
359
368
soup = BeautifulSoup(text)
360
soup.find("div", id="nav").extract()
369
extracted = soup.find("div", id="nav").extract()
361
370
self.assertEqual(str(soup), "<html>Real content here.</html>")
371
self.assertEqual(str(extracted), '<div id="nav">Nav crap</div>')
363
373
# A simple case, a more complex test.
364
374
text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
554
564
self.assertEquals(soup.find(text=r).string, text)
555
565
self.assertEquals(soup.find(text=text).__class__, Declaration)
567
namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
569
soup = BeautifulSoup(namespaced_doctype)
570
self.assertEquals(soup.contents[0],
571
'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
572
self.assertEquals(soup.html.contents[0], 'foo')
557
574
def testEntityConversions(self):
558
575
text = "<<sacré bleu!>>"
559
576
soup = BeautifulStoneSoup(text)
665
683
def testGarbageInGarbageOut(self):
666
684
ascii = "<foo>a</foo>"
685
asciiSoup = BeautifulStoneSoup(ascii)
686
self.assertEquals(ascii, str(asciiSoup))
667
688
unicodeData = u"<foo>\u00FC</foo>"
668
689
utf8 = unicodeData.encode("utf-8")
670
asciiSoup = BeautifulStoneSoup(ascii)
671
self.assertEquals(ascii, str(asciiSoup))
673
utf8Soup = BeautifulStoneSoup(utf8)
690
self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')
692
unicodeSoup = BeautifulStoneSoup(unicodeData)
693
self.assertEquals(unicodeData, unicode(unicodeSoup))
694
self.assertEquals(unicode(unicodeSoup.foo.string), u'\u00FC')
696
utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
674
697
self.assertEquals(utf8, str(utf8Soup))
675
698
self.assertEquals(utf8Soup.originalEncoding, "utf-8")
678
701
self.assertEquals(utf8, str(utf8Soup))
679
702
self.assertEquals(utf8Soup.originalEncoding, None)
681
unicodeSoup = BeautifulStoneSoup(unicodeData)
682
self.assertEquals(unicodeData, unicode(unicodeSoup))
684
705
def testHandleInvalidCodec(self):
685
706
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
696
717
utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
697
718
soup = BeautifulStoneSoup(euc_jp)
698
719
if soup.originalEncoding != "euc-jp":
699
raise "Test failed when parsing euc-jp document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it."
720
raise Exception("Test failed when parsing euc-jp document. "
721
"If you're running Python >=2.4, or you have "
722
"cjkcodecs installed, this is a real problem. "
723
"Otherwise, ignore it.")
701
725
self.assertEquals(soup.originalEncoding, "euc-jp")
702
726
self.assertEquals(str(soup), utf8)
708
732
def testRewrittenMetaTag(self):
709
733
no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
710
734
soup = BeautifulSoup(no_shift_jis_html)
711
self.assertEquals(soup.originalEncoding, "windows-1252")
713
736
# Beautiful Soup used to try to rewrite the meta tag even if the
714
737
# meta tag got filtered out by the strainer. This test makes
717
740
soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
718
741
self.assertEquals(soup.contents[0].name, 'pre')
720
shift_jis_html = '''<html><head>\n<meta http-equiv="Content-type" content="text/html; charset=x-sjis" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
743
meta_tag = ('<meta content="text/html; charset=x-sjis" '
744
'http-equiv="Content-type" />')
747
'<meta http-equiv="Content-language" content="ja" />'
748
'</head><body><pre>\n'
749
'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
750
'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
751
'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
752
'</pre></body></html>') % meta_tag
721
753
soup = BeautifulSoup(shift_jis_html)
722
754
if soup.originalEncoding != "shift-jis":
723
raise "Test failed when parsing shift-jis document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it."
755
raise Exception("Test failed when parsing shift-jis document "
756
"with meta tag '%s'."
757
"If you're running Python >=2.4, or you have "
758
"cjkcodecs installed, this is a real problem. "
759
"Otherwise, ignore it." % meta_tag)
724
760
self.assertEquals(soup.originalEncoding, "shift-jis")
725
self.assertEquals(str(soup), '<html><head>\n<meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</pre></body></html>')
762
content_type_tag = soup.meta['content']
763
self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
764
'charset=%SOUP-ENCODING%')
765
content_type = str(soup.meta)
766
index = content_type.find('charset=')
767
self.assertEqual(content_type[index:index+len('charset=utf8')+1],
769
content_type = soup.meta.__str__('shift-jis')
770
index = content_type.find('charset=')
771
self.assertEqual(content_type[index:index+len('charset=shift-jis')],
774
self.assertEquals(str(soup), (
776
'<meta content="text/html; charset=utf-8" '
777
'http-equiv="Content-type" />\n'
778
'<meta http-equiv="Content-language" content="ja" />'
779
'</head><body><pre>\n'
780
'\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
781
'\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
782
'\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
783
'\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
784
'\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
785
'</pre></body></html>'))
726
786
self.assertEquals(soup.renderContents("shift-jis"),
727
787
shift_jis_html.replace('x-sjis', 'shift-jis'))
731
791
self.assertSoupEquals(soup.__str__("utf-8"),
732
792
isolatin.replace("ISO-Latin-1", "utf-8").replace("\xe9", "\xc3\xa9"))
735
794
def testHebrew(self):
736
795
iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
737
796
utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
755
814
utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
756
815
self.assertSoupEquals(utf_8)
818
class Whitewash(SoupTest):
    """Test whitespace preservation and collapsing.

    Whitespace inside <pre> must be preserved verbatim; elsewhere the
    parser may normalize it.
    """

    def testPreservedWhitespace(self):
        # Single-argument form asserts the markup round-trips unchanged.
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<pre> woo </pre>")

    def testCollapsedWhitespace(self):
        # NOTE(review): input and expected output read identically here;
        # the original input likely contained a run of spaces that was
        # collapsed during extraction -- confirm against upstream source.
        self.assertSoupEquals("<p> </p>", "<p> </p>")
758
829
if __name__ == '__main__':