1
# -*- coding: utf-8 -*-
2
"""Unit tests for Beautiful Soup.
4
These tests make sure that Beautiful Soup works as it should. If you
find a bug in Beautiful Soup, the best way to express it is as a test
case like this that fails."""
9
from BeautifulSoup import *
11
class SoupTest(unittest.TestCase):
13
def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup):
14
"""Parse the given text and make sure its string rep is the other
18
self.assertEqual(str(c(toParse)), rep)
20
class FollowThatTag(SoupTest):
22
"Tests the various ways of fetching tags from a soup."
29
<b href="foo" id="x">4</a>
30
<ac width=100>4</ac>"""
31
self.soup = BeautifulStoneSoup(ml)
33
def testFindAllByName(self):
    """Fetch tags by name via call syntax, findAll() and a SoupStrainer."""
    anchors = self.soup('a')
    self.assertEqual(len(anchors), 2)
    firstAnchor = anchors[0]
    self.assertEqual(firstAnchor.name, 'a')

    # Calling the soup directly must agree with both spellings of findAll().
    viaFindAll = self.soup.findAll('a')
    self.assertEqual(anchors, viaFindAll)
    nameStrainer = SoupStrainer('a')
    self.assertEqual(anchors, self.soup.findAll(nameStrainer))
40
def testFindAllByAttribute(self):
    """Fetch tags by attribute using keyword, attrs-dict and strainer forms."""
    find = self.soup.findAll

    # The same id lookup, spelled three ways, must return the same tags.
    byKeyword = find(id='x')
    self.assertEqual(len(byKeyword), 2)
    self.assertEqual(byKeyword[0].name, 'a')
    self.assertEqual(byKeyword[1].name, 'b')

    byAttrs = find(attrs={'id' : 'x'})
    self.assertEqual(byKeyword, byAttrs)

    idStrainer = SoupStrainer(attrs={'id' : 'x'})
    self.assertEqual(byKeyword, find(idStrainer))

    # (keyword criteria, expected number of matches), checked in order.
    expectedCounts = [
        ({'id' : None}, 1),
        ({'width' : 100}, 1),
        ({'junk' : None}, 5),
        ({'junk' : [1, None]}, 5),
        ({'junk' : re.compile('.*')}, 0),
        ({'junk' : True}, 0),
        ({'junk' : True}, 0),
        ({'href' : True}, 1),
    ]
    for criteria, expected in expectedCounts:
        self.assertEqual(len(find(**criteria)), expected)
64
def testFindallByClass(self):
    """find('a', '1') returns the <a> whose class attribute is "1"."""
    markup = '<a>Foo</a><a class="1">Bar</a>'
    classedAnchor = BeautifulSoup(markup).find('a', '1')
    self.assertEqual(classedAnchor.string, "Bar")
68
def testFindAllByList(self):
    """Calling the soup with a list of tag names finds three tags."""
    wantedNames = ['a', 'ac']
    found = self.soup(wantedNames)
    self.assertEqual(len(found), 3)
72
def testFindAllByHash(self):
    """Calling the soup with a dict keyed by tag name finds four tags."""
    nameTable = {'a' : True, 'b' : True}
    found = self.soup(nameTable)
    self.assertEqual(len(found), 4)
76
def testFindAllText(self):
77
soup = BeautifulSoup("<html>\xbb</html>")
78
self.assertEqual(soup.findAll(text=re.compile('.*')),
81
def testFindAllByRE(self):
84
self.assertEqual(len(self.soup(r)), 3)
86
def testFindAllByMethod(self):
    """findAll() accepts a callable that takes a tag and returns a bool."""
    def idEqualsName(candidate):
        # True when the tag's id attribute is the same as its own name.
        return candidate.name == candidate.get('id')

    hits = self.soup.findAll(idEqualsName)
    self.assertEqual(len(hits), 2)
    self.assertEqual(hits[0].name, 'a')
94
def testParents(self):
95
soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
97
self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2)
98
self.assertEquals(b.findParent('ul')['a'], 'b')
100
PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')
103
soup = self.PROXIMITY_TEST
104
b = soup.find('b', {'id' : 2})
105
self.assertEquals(b.findNext('b')['id'], '3')
106
self.assertEquals(b.findNext('b')['id'], '3')
107
self.assertEquals(len(b.findAllNext('b')), 2)
108
self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1)
110
def testPrevious(self):
    """Navigate backwards from <b id="3"> in the PROXIMITY_TEST soup."""
    soup = self.PROXIMITY_TEST
    b = soup.find('b', {'id' : 3})
    self.assertEqual(b.findPrevious('b')['id'], '2')
    # Asked twice on purpose: a repeated lookup must give the same answer.
    self.assertEqual(b.findPrevious('b')['id'], '2')
    self.assertEqual(len(b.findAllPrevious('b')), 2)
    self.assertEqual(len(b.findAllPrevious('b', {'id' : 2})), 1)
119
SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')
121
def testNextSibling(self):
122
soup = self.SIBLING_TEST
124
b = soup.find(tag, {'id' : 2})
125
self.assertEquals(b.findNext(tag)['id'], '2.1')
126
self.assertEquals(b.findNextSibling(tag)['id'], '3')
127
self.assertEquals(b.findNextSibling(tag)['id'], '3')
128
self.assertEquals(len(b.findNextSiblings(tag)), 2)
129
self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1)
131
def testPreviousSibling(self):
132
soup = self.SIBLING_TEST
134
b = soup.find(tag, {'id' : 3})
135
self.assertEquals(b.findPrevious(tag)['id'], '2.1')
136
self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
137
self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
138
self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
139
self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1)
141
def testTextNavigation(self):
    """Navigation methods also work when starting from a text node."""
    soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
    baz = soup.find(text='Baz')
    self.assertEqual(baz.findParent("i")['id'], '1')
    self.assertEqual(baz.findNext(text='Blee'), 'Blee')
    self.assertEqual(baz.findNextSibling(text='Blee'), 'Blee')
    # In the markup 'Blargh' sits outside the <b> that holds 'Baz'.
    self.assertEqual(baz.findNextSibling(text='Blargh'), None)
    self.assertEqual(baz.findNextSibling('hr')['id'], '1')
150
class SiblingRivalry(SoupTest):
    "Tests the nextSibling and previousSibling navigation."

    def testSiblings(self):
        """Step between sibling nodes of a list with unclosed <li> tags."""
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        secondLI = soup.find('li').nextSibling
        # Two assertions instead of one compound boolean, so a failure
        # reports which property was wrong and what its value was.
        self.assertEqual(secondLI.name, 'li')
        self.assertEqual(secondLI.string, '2')
        self.assertEqual(soup.find(text='1').nextSibling.name, 'p')
        self.assertEqual(soup.find('p').nextSibling, 'B')
        # Going back one sibling and forward again must land on 'B' again.
        self.assertEqual(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
161
class TagsAreObjectsToo(SoupTest):
162
"Tests the various built-in functions of Tag objects."
165
soup = BeautifulSoup("<top>1<b>2</b>3</top>")
166
self.assertEquals(len(soup.top), 3)
168
class StringEmUp(SoupTest):
    "Tests the use of 'string' as an alias for a tag's only content."

    def testString(self):
        """A tag whose only child is text exposes that text as .string."""
        s = BeautifulSoup("<b>foo</b>")
        self.assertEqual(s.b.string, 'foo')

    def testLackOfString(self):
        """A tag with mixed text and tag children has a false .string."""
        s = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assertFalse(s.b.string)
179
class ThatsMyLimit(SoupTest):
    "Tests the limit argument."

    def testBasicLimits(self):
        """limit caps the number of results from findAll() and call syntax."""
        s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        self.assertEqual(len(s.findAll('br')), 4)
        self.assertEqual(len(s.findAll('br', limit=2)), 2)
        self.assertEqual(len(s('br', limit=2)), 2)
188
class OnlyTheLonely(SoupTest):
189
"Tests the parseOnly argument to the constructor."
193
x.append('<a id="%s">' % i)
194
for j in range(100,103):
195
x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
200
strainer = SoupStrainer("b")
201
soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
202
self.assertEquals(len(soup), 15)
204
strainer = SoupStrainer(id=re.compile("100.*"))
205
soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
206
self.assertEquals(len(soup), 5)
208
strainer = SoupStrainer(text=re.compile("10[01].*"))
209
soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
210
self.assertEquals(len(soup), 10)
212
strainer = SoupStrainer(text=lambda(x):x[8]=='3')
213
soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
214
self.assertEquals(len(soup), 3)
216
class PickleMeThis(SoupTest):
217
"Testing features like pickle and deepcopy."
220
self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
221
"http://www.w3.org/TR/REC-html40/transitional.dtd">
224
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
225
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
226
<link rev="made" href="mailto:leonardr@segfault.org">
227
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
228
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
229
<meta name="author" content="Leonard Richardson">
232
<a href="foo">foo</a>
233
<a href="foo"><b>bar</b></a>
237
self.soup = BeautifulSoup(self.page)
239
def testPickle(self):
241
dumped = pickle.dumps(self.soup, 2)
242
loaded = pickle.loads(dumped)
243
self.assertEqual(loaded.__class__, BeautifulSoup)
244
self.assertEqual(str(loaded), str(self.soup))
246
def testDeepcopy(self):
    """A deep copy of the soup renders to the same string as the original."""
    import copy
    duplicate = copy.deepcopy(self.soup)
    self.assertEqual(str(duplicate), str(self.soup))
251
class WriteOnlyCode(SoupTest):
252
"Testing the modification of the tree."
254
def testModifyAttributes(self):
255
soup = BeautifulSoup('<a id="1"></a>')
257
self.assertEqual(soup.renderContents(), '<a id="2"></a>')
259
self.assertEqual(soup.renderContents(), '<a></a>')
260
soup.a['id2'] = 'foo'
261
self.assertEqual(soup.renderContents(), '<a id2="foo"></a>')
263
def testNewTagCreation(self):
264
"Makes sure tags don't step on each others' toes."
265
soup = BeautifulSoup()
268
a['href'] = 'http://foo.com/'
269
self.assertRaises(KeyError, lambda : ol['href'])
271
def testTagReplacement(self):
272
# Make sure you can replace an element with itself.
273
text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
274
soup = BeautifulSoup(text)
276
soup.c.replaceWith(c)
277
self.assertEquals(str(soup), text)
280
soup = BeautifulSoup("<b>Argh!</b>")
281
soup.find(text="Argh!").replaceWith("Hooray!")
282
newText = soup.find(text="Hooray!")
284
self.assertEqual(newText.previous, b)
285
self.assertEqual(newText.parent, b)
286
self.assertEqual(newText.previous.next, newText)
287
self.assertEqual(newText.next, None)
289
# A more complex case
290
soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
291
soup.b.insert(1, "Hooray!")
292
newText = soup.find(text="Hooray!")
293
self.assertEqual(newText.previous, "Argh!")
294
self.assertEqual(newText.previous.next, newText)
296
self.assertEqual(newText.previousSibling, "Argh!")
297
self.assertEqual(newText.previousSibling.nextSibling, newText)
299
self.assertEqual(newText.nextSibling, None)
300
self.assertEqual(newText.next, soup.c)
302
text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
303
soup = BeautifulSoup(text)
304
no, show = soup.findAll('b')
306
self.assertEquals(str(soup), "<html>There's business like <b>no</b> business</html>")
309
soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
310
tag = Tag(soup, 'magictag')
312
soup.a.insert(1, tag)
316
theText = tag.find(text=True)
317
findText = b.find(text="Find")
319
self.assertEqual(findText.next, tag)
320
self.assertEqual(tag.previous, findText)
321
self.assertEqual(b.nextSibling, tag)
322
self.assertEqual(tag.previousSibling, b)
323
self.assertEqual(tag.nextSibling, c)
324
self.assertEqual(c.previousSibling, tag)
326
self.assertEqual(theText.next, c)
327
self.assertEqual(c.previous, theText)
329
# Aand... incredibly complex.
330
soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
335
weText = a.find(text="We")
336
soup.b.replaceWith(soup.f)
337
self.assertEqual(str(soup), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")
339
self.assertEqual(f.previous, weText)
340
self.assertEqual(weText.next, f)
341
self.assertEqual(f.previousSibling, weText)
342
self.assertEqual(f.nextSibling, None)
343
self.assertEqual(weText.nextSibling, f)
345
def testAppend(self):
    """Appending a tag that is already in the tree moves it."""
    markup = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
    soup = BeautifulSoup(markup)
    secondPara = soup('p')[1]
    boldTag = soup.find('b')

    # Look both nodes up again rather than reusing the references above.
    destination = soup('p')[1]
    destination.append(soup.find('b'))

    self.assertEqual(boldTag.parent, secondPara)
    expected = ("<p>Don't leave me .</p> "
                "<p>Don't leave me.<b>here</b></p>")
    self.assertEqual(str(soup), expected)
356
def testTagExtraction(self):
358
text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
359
soup = BeautifulSoup(text)
360
soup.find("div", id="nav").extract()
361
self.assertEqual(str(soup), "<html>Real content here.</html>")
363
# A simple case, a more complex test.
364
text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
365
soup = BeautifulStoneSoup(text)
367
numbers, roman, letters = soup("a")
369
self.assertEqual(roman.parent, doc)
370
oldPrevious = roman.previous
371
endOfThisTag = roman.nextSibling.previous
372
self.assertEqual(oldPrevious, "2")
373
self.assertEqual(roman.next, "i")
374
self.assertEqual(endOfThisTag, "ii")
375
self.assertEqual(roman.previousSibling, numbers)
376
self.assertEqual(roman.nextSibling, letters)
379
self.assertEqual(roman.parent, None)
380
self.assertEqual(roman.previous, None)
381
self.assertEqual(roman.next, "i")
382
self.assertEqual(letters.previous, '2')
383
self.assertEqual(roman.previousSibling, None)
384
self.assertEqual(roman.nextSibling, None)
385
self.assertEqual(endOfThisTag.next, None)
386
self.assertEqual(roman.b.contents[0].next, None)
387
self.assertEqual(numbers.nextSibling, letters)
388
self.assertEqual(letters.previousSibling, numbers)
389
self.assertEqual(len(doc.contents), 2)
390
self.assertEqual(doc.contents[0], numbers)
391
self.assertEqual(doc.contents[1], letters)
393
# A more complex case.
394
text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
395
soup = BeautifulStoneSoup(text)
396
one = soup.find(text="1")
397
three = soup.find(text="3")
400
self.assertEqual(one.next, three)
401
self.assertEqual(three.previous, one)
402
self.assertEqual(one.parent.nextSibling, three)
403
self.assertEqual(three.previousSibling, soup.a)
405
class TheManWithoutAttributes(SoupTest):
    "Test attribute access"

    def testHasKey(self):
        """has_key() reports an attribute that is present on the tag."""
        text = "<foo attr='bar'>"
        self.assertEqual(BeautifulSoup(text).foo.has_key('attr'), True)
412
class QuoteMeOnThat(SoupTest):
414
def testQuotedAttributeValues(self):
    """Check how attribute values containing quotes and brackets render."""
    self.assertSoupEquals("<foo attr='bar'></foo>",
                          '<foo attr="bar"></foo>')

    text = """<foo attr='bar "brawls" happen'>a</foo>"""
    soup = BeautifulSoup(text)
    self.assertEqual(soup.renderContents(), text)

    soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
    newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
    self.assertSoupEquals(soup.renderContents(), newText)

    # NOTE(review): the expected strings in the next two checks may have
    # lost entity escapes (e.g. &amp;, &lt;) in transit -- confirm against
    # the upstream test suite.
    self.assertSoupEquals('<this is="really messed up & stuff">',
                          '<this is="really messed up & stuff"></this>')

    # This is not what the original author had in mind, but it's
    # a legitimate interpretation of what they wrote.
    self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""",
                          '<a href="foo</a>, </a><a href="></a>, <a href="bar">baz</a>')

    # SGMLParser generates bogus parse events when attribute values
    # contain embedded brackets; the strings below pin the output that
    # Beautiful Soup is expected to produce for such input.
    self.assertSoupEquals('<a b="<a>">', '<a b="<a>"></a><a>"></a>')
    self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah',
                          """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
443
class YoureSoLiteral(SoupTest):
445
def testLiteralMode(self):
    """Text inside <script> is kept as is, even when it contains '<'."""
    markup = "<script>if (i<imgs.length)</script><b>Foo</b>"
    parsed = BeautifulSoup(markup)
    scriptText = parsed.script.contents[0]
    self.assertEqual(scriptText, "if (i<imgs.length)")
    boldText = parsed.b.contents[0]
    self.assertEqual(boldText, "Foo")
451
def testTextArea(self):
    """Markup inside <textarea> is kept as a single text child."""
    markup = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
    parsed = BeautifulSoup(markup)
    firstChild = parsed.textarea.contents[0]
    self.assertEqual(firstChild,
                     "<b>This is an example of an HTML tag</b><&<&")
457
class OperatorOverload(SoupTest):
    "Our operators do it all! Call now!"

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        # The markup contains no <a> tag, so the lookup must be false.
        self.assertFalse(soup.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
474
class NestableEgg(SoupTest):
475
"""Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""
477
def testParaInsideBlockquote(self):
    """A <p> opened in a <blockquote> stays inside it; the next is outside."""
    parsed = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
    quote = parsed.blockquote
    self.assertEqual(quote.p.b.string, 'Foo')
    self.assertEqual(quote.b.string, 'Foo')
    # recursive=False restricts the search to direct children of the soup.
    topLevelPara = parsed.find('p', recursive=False)
    self.assertEqual(topLevelPara.string, 'Bar')
483
def testNestedTables(self):
484
text = """<table id="1"><tr><td>Here's another table:
485
<table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
486
soup = BeautifulSoup(text)
487
self.assertEquals(soup.table.table.td.string, 'Juicy text')
488
self.assertEquals(len(soup.findAll('table')), 2)
489
self.assertEquals(len(soup.table.findAll('table')), 1)
490
self.assertEquals(soup.find('table', {'id' : 2}).parent.parent.parent.name,
493
text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
494
soup = BeautifulSoup(text)
495
self.assertEquals(soup.table.tr.td.div.table.contents[0], "Foo")
497
text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
498
<tfoot><tr>Baz</tr></tfoot></table>"""
499
soup = BeautifulSoup(text)
500
self.assertEquals(soup.table.thead.tr.contents[0], "Foo")
502
def testBadNestedTables(self):
    """A <table> opened directly inside a <tr> is nested under that row."""
    soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
    self.assertEqual(soup.table.tr.table.tr['id'], 'nested')
506
class CleanupOnAisleFour(SoupTest):
507
"""Here we test cleanup of text that breaks SGMLParser or is just
510
def testSelfClosingtag(self):
511
self.assertEqual(str(BeautifulSoup("Foo<br/>Bar").find('br')),
514
self.assertSoupEquals('<p>test1<br/>test2</p>',
515
'<p>test1<br />test2</p>')
517
text = '<p>test1<selfclosing>test2'
518
soup = BeautifulStoneSoup(text)
519
self.assertEqual(str(soup),
520
'<p>test1<selfclosing>test2</selfclosing></p>')
522
soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
523
self.assertEqual(str(soup),
524
'<p>test1<selfclosing />test2</p>')
526
def testSelfClosingTagOrNot(self):
    """<link> keeps its content in BeautifulStoneSoup, not in BeautifulSoup."""
    markup = "<item><link>http://foo.com/</link></item>"
    renderedAsXML = BeautifulStoneSoup(markup).renderContents()
    self.assertEqual(renderedAsXML, markup)
    renderedAsHTML = BeautifulSoup(markup).renderContents()
    self.assertEqual(renderedAsHTML,
                     '<item><link />http://foo.com/</item>')
533
xml = "<root>foo<![CDATA[foobar]]>bar</root>"
534
self.assertSoupEquals(xml, xml)
535
r = re.compile("foo.*bar")
536
soup = BeautifulSoup(xml)
537
self.assertEquals(soup.find(text=r).string, "foobar")
538
self.assertEquals(soup.find(text=r).__class__, CData)
540
def testComments(self):
    """A comment round-trips and is found by text search as a Comment."""
    xml = "foo<!--foobar-->baz"
    self.assertSoupEquals(xml)
    r = re.compile("foo.*bar")
    soup = BeautifulSoup(xml)
    self.assertEqual(soup.find(text=r).string, "foobar")
    self.assertEqual(soup.find(text="foobar").__class__, Comment)
548
def testDeclaration(self):
    """A declaration round-trips and is found by text search."""
    xml = "foo<!DOCTYPE foobar>baz"
    self.assertSoupEquals(xml)
    r = re.compile(".*foo.*bar")
    soup = BeautifulSoup(xml)
    text = "DOCTYPE foobar"
    self.assertEqual(soup.find(text=r).string, text)
    self.assertEqual(soup.find(text=text).__class__, Declaration)
557
def testEntityConversions(self):
558
text = "<<sacré bleu!>>"
559
soup = BeautifulStoneSoup(text)
560
self.assertSoupEquals(text)
562
xmlEnt = BeautifulStoneSoup.XML_ENTITIES
563
htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
564
xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES
566
soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
567
self.assertEquals(str(soup), "<<sacré bleu!>>")
569
soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
570
self.assertEquals(str(soup), "<<sacré bleu!>>")
572
soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
573
self.assertEquals(unicode(soup), u"<<sacr\xe9 bleu!>>")
575
# Make sure the "XML", "HTML", and "XHTML" settings work.
576
text = "<™'"
577
soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
578
self.assertEquals(unicode(soup), u"<™'")
580
soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
581
self.assertEquals(unicode(soup), u"<\u2122'")
583
soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
584
self.assertEquals(unicode(soup), u"<\u2122'")
586
invalidEntity = "foo&#bar;baz"
587
soup = BeautifulStoneSoup\
589
convertEntities=htmlEnt)
590
self.assertEquals(str(soup), invalidEntity)
592
def testNonBreakingSpaces(self):
593
soup = BeautifulSoup("<a> </a>",
594
convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
595
self.assertEquals(unicode(soup), u"<a>\xa0\xa0</a>")
597
def testWhitespaceInDeclaration(self):
    """The space after '<!' is gone when the declaration is rendered."""
    given, wanted = '<! DOCTYPE>', '<!DOCTYPE>'
    self.assertSoupEquals(given, wanted)
600
def testJunkInDeclaration(self):
    """A junk declaration is rendered without the space after '<!'."""
    given, wanted = '<! Foo = -8>a', '<!Foo = -8>a'
    self.assertSoupEquals(given, wanted)
603
def testIncompleteDeclaration(self):
    """Feed an unterminated '<!' declaration through assertSoupEquals."""
    # Only the markup is passed; no separate expected string is supplied.
    markup = 'a<!b <p>c'
    self.assertSoupEquals(markup)
606
def testEntityReplacement(self):
607
self.assertSoupEquals('<b>hello there</b>')
609
def testEntitiesInAttributeValues(self):
    """Check rendering of attribute values that contain entities."""
    # NOTE(review): several literals in this test look as if entity
    # references were decoded in transit (e.g. the first two inputs are
    # identical) -- confirm against the upstream test suite.
    self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>')
    self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>')

    soup = BeautifulSoup('<x t=">™">',
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    self.assertEqual(unicode(soup), u'<x t=">\u2122"></x>')

    uri = "http://crummy.com?sacré&bleu"
    link = '<a href="%s"></a>' % uri
    soup = BeautifulSoup(link)
    self.assertEqual(unicode(soup), link)
    #self.assertEquals(unicode(soup.a['href']), uri)

    soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
    self.assertEqual(unicode(soup),
                     link.replace("é", u"\xe9"))

    uri = "http://crummy.com?sacré&bleu"
    link = '<a href="%s"></a>' % uri
    soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
    self.assertEqual(unicode(soup.a['href']),
                     uri.replace("é", u"\xe9"))
633
def testNakedAmpersands(self):
    """A bare '&' is escaped to '&amp;' on output; '&amp;' is left alone."""
    # The previous text used replace('&', '&'), a no-op that made the
    # "valid" strings identical to the invalid ones. The replacement
    # target is restored to the escaped form '&amp;' so that the
    # assertions actually check escaping.
    # NOTE(review): restored from the variable names and the no-op
    # replace calls -- confirm against the upstream test suite.
    html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
    soup = BeautifulStoneSoup("AT&T ", **html)
    self.assertEqual(str(soup), 'AT&amp;T ')

    nakedAmpersandInASentence = "AT&T was Ma Bell"
    soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
    self.assertEqual(str(soup),
                     nakedAmpersandInASentence.replace('&','&amp;'))

    invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
    validURL = invalidURL.replace('&','&amp;')
    soup = BeautifulStoneSoup(invalidURL)
    self.assertEqual(str(soup), validURL)

    # Markup that is already escaped must come back unchanged.
    soup = BeautifulStoneSoup(validURL)
    self.assertEqual(str(soup), validURL)
651
class EncodeRed(SoupTest):
652
"""Tests encoding conversion, Unicode conversion, and Microsoft
653
smart quote fixes."""
655
def testUnicodeDammitStandalone(self):
    """Use UnicodeDammit directly, without building a soup."""
    markup = "<foo>\x92</foo>"
    dammit = UnicodeDammit(markup)
    # NOTE(review): the expected literal may originally have been an
    # entity reference for the right single quote -- confirm upstream.
    self.assertEqual(dammit.unicode, "<foo>’</foo>")

    hebrew = "\xed\xe5\xec\xf9"
    dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
    self.assertEqual(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
    self.assertEqual(dammit.originalEncoding, 'iso-8859-8')
665
def testGarbageInGarbageOut(self):
    """ASCII, UTF-8 and unicode input each render back as expected."""
    # Named asciiData rather than ascii to avoid shadowing a builtin.
    asciiData = "<foo>a</foo>"
    unicodeData = u"<foo>\u00FC</foo>"
    utf8 = unicodeData.encode("utf-8")

    asciiSoup = BeautifulStoneSoup(asciiData)
    self.assertEqual(asciiData, str(asciiSoup))

    utf8Soup = BeautifulStoneSoup(utf8)
    self.assertEqual(utf8, str(utf8Soup))
    self.assertEqual(utf8Soup.originalEncoding, "utf-8")

    # Unicode input: str() gives UTF-8 and no original encoding is recorded.
    utf8Soup = BeautifulStoneSoup(unicodeData)
    self.assertEqual(utf8, str(utf8Soup))
    self.assertEqual(utf8Soup.originalEncoding, None)

    unicodeSoup = BeautifulStoneSoup(unicodeData)
    self.assertEqual(unicodeData, unicode(unicodeSoup))
684
def testHandleInvalidCodec(self):
    """A bogus fromEncoding still ends with utf-8 as originalEncoding."""
    for bad_encoding in ['.utf8', '...', 'utF---16.!']:
        soup = BeautifulSoup("Räksmörgås", fromEncoding=bad_encoding)
        self.assertEqual(soup.originalEncoding, 'utf-8')
689
def testUnicodeSearch(self):
    """A unicode text search finds a non-ASCII string in unicode markup."""
    markup = u'<html><body><h1>Räksmörgås</h1></body></html>'
    parsed = BeautifulSoup(markup)
    found = parsed.find(text=u'Räksmörgås')
    self.assertEqual(found, u'Räksmörgås')
694
def testRewrittenXMLHeader(self):
    """The XML declaration's encoding is rewritten to utf-8 on output."""
    euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
    utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
    soup = BeautifulStoneSoup(euc_jp)
    # The old code did `raise "<message>"`. A string is not a valid
    # exception (TypeError since Python 2.6), so the hint never reached
    # the reader. It is now the failure message of the assertion.
    self.assertEqual(soup.originalEncoding, "euc-jp",
                     "Test failed when parsing euc-jp document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it.")
    self.assertEqual(str(soup), utf8)

    # NOTE(review): the expected quote character below may originally
    # have been an entity reference -- confirm upstream.
    old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
    new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>"
    self.assertSoupEquals(old_text, new_text)
708
def testRewrittenMetaTag(self):
    """The charset in a Content-type <meta> tag is rewritten on output."""
    no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
    soup = BeautifulSoup(no_shift_jis_html)
    self.assertEqual(soup.originalEncoding, "windows-1252")

    # Beautiful Soup used to try to rewrite the meta tag even if the
    # meta tag got filtered out by the strainer. This test makes
    # sure that doesn't happen.
    strainer = SoupStrainer('pre')
    soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
    self.assertEqual(soup.contents[0].name, 'pre')

    shift_jis_html = '''<html><head>\n<meta http-equiv="Content-type" content="text/html; charset=x-sjis" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
    soup = BeautifulSoup(shift_jis_html)
    # The old code did `raise "<message>"`. A string is not a valid
    # exception (TypeError since Python 2.6), so the hint never reached
    # the reader. It is now the failure message of the assertion.
    self.assertEqual(soup.originalEncoding, "shift-jis",
                     "Test failed when parsing shift-jis document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it.")
    self.assertEqual(str(soup), '<html><head>\n<meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</pre></body></html>')
    self.assertEqual(soup.renderContents("shift-jis"),
                     shift_jis_html.replace('x-sjis', 'shift-jis'))

    isolatin ="""<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
    soup = BeautifulSoup(isolatin)
    self.assertSoupEquals(soup.__str__("utf-8"),
                          isolatin.replace("ISO-Latin-1", "utf-8").replace("\xe9", "\xc3\xa9"))
735
def testHebrew(self):
    """An ISO-8859-8 document parsed with fromEncoding renders as UTF-8."""
    iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
    utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
    soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
    self.assertEqual(str(soup), utf8)
741
def testSmartQuotesNotSoSmartAnymore(self):
    """Check how the \\x91 and \\x92 smart-quote bytes are rendered."""
    given = "\x91Foo\x92 <!--blah-->"
    wanted = '‘Foo’ <!--blah-->'
    self.assertSoupEquals(given, wanted)
745
def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
746
smartQuotes = "Il a dit, \x8BSacré bleu!\x9b"
747
soup = BeautifulSoup(smartQuotes)
748
self.assertEquals(str(soup),
749
'Il a dit, ‹Sacré bleu!›')
750
soup = BeautifulSoup(smartQuotes, convertEntities="html")
751
self.assertEquals(str(soup),
752
'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
754
def testDontSeeSmartQuotesWhereThereAreNone(self):
    """This byte string must come back from the parser unchanged."""
    multibyteText = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
    self.assertSoupEquals(multibyteText)
758
if __name__ == '__main__':