~jonas-drange/ubuntu-start-page/1252899-mobile-friendly

« back to all changes in this revision

Viewing changes to src/BeautifulSoup/BeautifulSoupTests.py

  • Committer: Matthew Nuzum
  • Date: 2008-04-18 01:58:53 UTC
  • Revision ID: matthew.nuzum@canonical.com-20080418015853-2b8rf979z2c2exxl
adding files

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
# -*- coding: utf-8 -*-
 
2
"""Unit tests for Beautiful Soup.
 
3
 
 
4
These tests make sure the Beautiful Soup works as it should. If you
 
5
find a bug in Beautiful Soup, the best way to express it is as a test
 
6
case like this that fails."""
 
7
 
 
8
import unittest
 
9
from BeautifulSoup import *
 
10
 
 
11
class SoupTest(unittest.TestCase):
    """Common base class giving every suite a parse-and-compare assertion."""

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup):
        """Parse the given text and make sure its string rep is the other
        given text.

        toParse -- markup handed to the parser.
        rep -- expected ``str()`` of the parsed document. When omitted, the
            markup is expected to come back out unchanged.
        c -- callable used to build the document from ``toParse``.
        """
        # Identity check: only a genuinely missing `rep` falls back to the
        # input markup.
        if rep is None:
            rep = toParse
        self.assertEqual(str(c(toParse)), rep)
 
19
 
 
20
class FollowThatTag(SoupTest):
    """Covers the lookup styles: by name, attribute, class, list, dict,
    regular expression, callable, and position relative to another node."""

    def setUp(self):
        # Deliberately sloppy markup: mixed-case tag names and end tags that
        # do not match their start tags.
        markup = """
        <a id="x">1</a>
        <A id="a">2</a>
        <b id="b">3</a>
        <b href="foo" id="x">4</a>
        <ac width=100>4</ac>"""
        self.soup = BeautifulStoneSoup(markup)

    def testFindAllByName(self):
        # Calling the soup, findAll(name) and findAll(strainer) must agree.
        anchors = self.soup('a')
        self.assertEqual(len(anchors), 2)
        self.assertEqual(anchors[0].name, 'a')
        self.assertEqual(anchors, self.soup.findAll('a'))
        self.assertEqual(anchors, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        find_all = self.soup.findAll

        # Keyword form.
        by_keyword = find_all(id='x')
        self.assertEqual(len(by_keyword), 2)
        self.assertEqual(by_keyword[0].name, 'a')
        self.assertEqual(by_keyword[1].name, 'b')

        # The attrs dictionary form gives the same result...
        by_attrs = find_all(attrs={'id': 'x'})
        self.assertEqual(by_keyword, by_attrs)

        # ...and so does an equivalent strainer.
        id_strainer = SoupStrainer(attrs={'id': 'x'})
        self.assertEqual(by_keyword, find_all(id_strainer))

        self.assertEqual(len(find_all(id=None)), 1)

        self.assertEqual(len(find_all(width=100)), 1)
        self.assertEqual(len(find_all(junk=None)), 5)
        self.assertEqual(len(find_all(junk=[1, None])), 5)

        self.assertEqual(len(find_all(junk=re.compile('.*'))), 0)
        self.assertEqual(len(find_all(junk=True)), 0)

        self.assertEqual(len(find_all(junk=True)), 0)
        self.assertEqual(len(find_all(href=True)), 1)

    def testFindallByClass(self):
        # The second positional argument of find() is used as the class here.
        doc = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
        self.assertEqual(doc.find('a', '1').string, "Bar")

    def testFindAllByList(self):
        found = self.soup(['a', 'ac'])
        self.assertEqual(len(found), 3)

    def testFindAllByHash(self):
        found = self.soup({'a': True, 'b': True})
        self.assertEqual(len(found), 4)

    def testFindAllText(self):
        doc = BeautifulSoup("<html>\xbb</html>")
        all_text = doc.findAll(text=re.compile('.*'))
        self.assertEqual(all_text, [u'\xbb'])

    def testFindAllByRE(self):
        import re
        name_pattern = re.compile('a.*')
        self.assertEqual(len(self.soup(name_pattern)), 3)

    def testFindAllByMethod(self):
        def id_equals_name(tag):
            return tag.name == tag.get('id')

        found = self.soup.findAll(id_equals_name)
        self.assertEqual(len(found), 2)
        self.assertEqual(found[0].name, 'a')

    def testParents(self):
        doc = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
        bold = doc.b
        self.assertEqual(len(bold.findParents('ul', {'id': 'foo'})), 2)
        self.assertEqual(bold.findParent('ul')['a'], 'b')

    # Shared fixture for the next/previous tests: four <b> tags, none closed.
    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        start = self.PROXIMITY_TEST.find('b', {'id': 2})
        self.assertEqual(start.findNext('b')['id'], '3')
        self.assertEqual(start.findNext('b')['id'], '3')
        self.assertEqual(len(start.findAllNext('b')), 2)
        self.assertEqual(len(start.findAllNext('b', {'id': 4})), 1)

    def testPrevious(self):
        start = self.PROXIMITY_TEST.find('b', {'id': 3})
        self.assertEqual(start.findPrevious('b')['id'], '2')
        self.assertEqual(start.findPrevious('b')['id'], '2')
        self.assertEqual(len(start.findAllPrevious('b')), 2)
        self.assertEqual(len(start.findAllPrevious('b', {'id': 2})), 1)

    # Shared fixture for the sibling tests: top-level blockquotes 1-4, the
    # first three each wrapping one nested blockquote.
    SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

    def testNextSibling(self):
        name = 'blockquote'
        start = self.SIBLING_TEST.find(name, {'id': 2})
        # findNext descends into the nested quote; findNextSibling does not.
        self.assertEqual(start.findNext(name)['id'], '2.1')
        self.assertEqual(start.findNextSibling(name)['id'], '3')
        self.assertEqual(start.findNextSibling(name)['id'], '3')
        self.assertEqual(len(start.findNextSiblings(name)), 2)
        self.assertEqual(len(start.findNextSiblings(name, {'id': 4})), 1)

    def testPreviousSibling(self):
        name = 'blockquote'
        start = self.SIBLING_TEST.find(name, {'id': 3})
        self.assertEqual(start.findPrevious(name)['id'], '2.1')
        self.assertEqual(start.findPreviousSibling(name)['id'], '2')
        self.assertEqual(start.findPreviousSibling(name)['id'], '2')
        self.assertEqual(len(start.findPreviousSiblings(name)), 2)
        self.assertEqual(len(start.findPreviousSiblings(name, id=1)), 1)

    def testTextNavigation(self):
        # The same navigation calls, starting from a text node.
        doc = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
        baz_text = doc.find(text='Baz')
        self.assertEqual(baz_text.findParent("i")['id'], '1')
        self.assertEqual(baz_text.findNext(text='Blee'), 'Blee')
        self.assertEqual(baz_text.findNextSibling(text='Blee'), 'Blee')
        self.assertEqual(baz_text.findNextSibling(text='Blargh'), None)
        self.assertEqual(baz_text.findNextSibling('hr')['id'], '1')
 
149
 
 
150
class SiblingRivalry(SoupTest):
    """Checks navigation through nextSibling and previousSibling."""

    def testSiblings(self):
        doc = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        second_item = doc.find('li').nextSibling
        self.assertTrue(second_item.name == 'li' and second_item.string == '2')
        self.assertEqual(doc.find(text='1').nextSibling.name, 'p')
        self.assertEqual(doc.find('p').nextSibling, 'B')
        # Stepping forward, back, then forward again lands on the same text.
        after_para = doc.find('p').nextSibling
        self.assertEqual(after_para.previousSibling.nextSibling, 'B')
 
160
 
 
161
class TagsAreObjectsToo(SoupTest):
    """Checks built-in functions applied to Tag objects."""

    def testLen(self):
        # <top> holds a text node, a <b> tag and another text node.
        doc = BeautifulSoup("<top>1<b>2</b>3</top>")
        child_count = len(doc.top)
        self.assertEqual(child_count, 3)
 
167
 
 
168
class StringEmUp(SoupTest):
    """Checks 'string' as a shortcut for a tag's single piece of content."""

    def testString(self):
        doc = BeautifulSoup("<b>foo</b>")
        self.assertEqual(doc.b.string, 'foo')

    def testLackOfString(self):
        # With several children, .string is expected to be falsy.
        doc = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assertTrue(not doc.b.string)
 
178
 
 
179
class ThatsMyLimit(SoupTest):
    """Checks the 'limit' argument of the search methods."""

    def testBasicLimits(self):
        doc = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        unlimited = doc.findAll('br')
        self.assertEqual(len(unlimited), 4)
        # The limit applies to findAll() and to calling the soup directly.
        self.assertEqual(len(doc.findAll('br', limit=2)), 2)
        self.assertEqual(len(doc('br', limit=2)), 2)
 
187
 
 
188
class OnlyTheLonely(SoupTest):
    """Tests the parseOnlyThese argument to the constructor."""

    def setUp(self):
        # Build five <a id="i"> tags (i = 1..5), each wrapping three
        # <b id="i.j">Content i.j</b> tags (j = 100..102): 15 <b> tags total.
        x = []
        for i in range(1, 6):
            x.append('<a id="%s">' % i)
            for j in range(100, 103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i, j, i, j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        # Strain by tag name.
        strainer = SoupStrainer("b")
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 15)

        # Strain by an attribute regular expression.
        strainer = SoupStrainer(id=re.compile("100.*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 5)

        # Strain by a regular expression over text.
        strainer = SoupStrainer(text=re.compile("10[01].*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 10)

        # Strain by a callable over text. Index 8 is the first character
        # after "Content ". Written as `lambda x:` because the parenthesised
        # `lambda(x):` form is a syntax error outside Python 2.
        strainer = SoupStrainer(text=lambda x: x[8] == '3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 3)
 
215
 
 
216
class PickleMeThis(SoupTest):
    """Checks that a parsed document survives pickling and deep copying."""

    def setUp(self):
        # A small complete HTML page, parsed once per test.
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""

        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        import pickle
        # Round-trip through pickle protocol 2; only data built by this test
        # is ever unpickled.
        restored = pickle.loads(pickle.dumps(self.soup, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(str(restored), str(self.soup))

    def testDeepcopy(self):
        from copy import deepcopy
        duplicate = deepcopy(self.soup)
        self.assertEqual(str(duplicate), str(self.soup))
 
250
 
 
251
class WriteOnlyCode(SoupTest):
    """Checks operations that modify the parse tree."""

    def testModifyAttributes(self):
        # Set, delete and add attributes, re-rendering after each change.
        doc = BeautifulSoup('<a id="1"></a>')
        doc.a['id'] = 2
        self.assertEqual(doc.renderContents(), '<a id="2"></a>')
        del doc.a['id']
        self.assertEqual(doc.renderContents(), '<a></a>')
        doc.a['id2'] = 'foo'
        self.assertEqual(doc.renderContents(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        doc = BeautifulSoup()
        anchor = Tag(doc, 'a')
        ordered_list = Tag(doc, 'ol')
        anchor['href'] = 'http://foo.com/'
        # An attribute set on one new tag must not appear on the other.
        self.assertRaises(KeyError, lambda: ordered_list['href'])

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        markup = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        doc = BeautifulSoup(markup)
        c_tag = doc.c
        doc.c.replaceWith(c_tag)
        self.assertEqual(str(doc), markup)

        # A very simple case
        doc = BeautifulSoup("<b>Argh!</b>")
        doc.find(text="Argh!").replaceWith("Hooray!")
        replacement = doc.find(text="Hooray!")
        bold = doc.b
        self.assertEqual(replacement.previous, bold)
        self.assertEqual(replacement.parent, bold)
        self.assertEqual(replacement.previous.next, replacement)
        self.assertEqual(replacement.next, None)

        # A more complex case
        doc = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        doc.b.insert(1, "Hooray!")
        inserted = doc.find(text="Hooray!")
        self.assertEqual(inserted.previous, "Argh!")
        self.assertEqual(inserted.previous.next, inserted)

        self.assertEqual(inserted.previousSibling, "Argh!")
        self.assertEqual(inserted.previousSibling.nextSibling, inserted)

        self.assertEqual(inserted.nextSibling, None)
        self.assertEqual(inserted.next, doc.c)

        # Replacing one tag with another tag taken from the same document.
        markup = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        doc = BeautifulSoup(markup)
        no_tag, show_tag = doc.findAll('b')
        show_tag.replaceWith(no_tag)
        self.assertEqual(
            str(doc),
            "<html>There's  business like <b>no</b> business</html>")

        # Even more complex
        doc = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        magic = Tag(doc, 'magictag')
        magic.insert(0, "the")
        doc.a.insert(1, magic)

        bold = doc.b
        c_tag = doc.c
        the_text = magic.find(text=True)
        find_text = bold.find(text="Find")

        self.assertEqual(find_text.next, magic)
        self.assertEqual(magic.previous, find_text)
        self.assertEqual(bold.nextSibling, magic)
        self.assertEqual(magic.previousSibling, bold)
        self.assertEqual(magic.nextSibling, c_tag)
        self.assertEqual(c_tag.previousSibling, magic)

        self.assertEqual(the_text.next, c_tag)
        self.assertEqual(c_tag.previous, the_text)

        # Aand... incredibly complex.
        doc = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        f_tag = doc.f
        a_tag = doc.a
        # These two lookups are not used afterwards; they are kept so the
        # test performs the same calls as before.
        c_tag = doc.c
        e_tag = doc.e
        we_text = a_tag.find(text="We")
        doc.b.replaceWith(doc.f)
        self.assertEqual(str(doc), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f_tag.previous, we_text)
        self.assertEqual(we_text.next, f_tag)
        self.assertEqual(f_tag.previousSibling, we_text)
        self.assertEqual(f_tag.nextSibling, None)
        self.assertEqual(we_text.nextSibling, f_tag)

    def testAppend(self):
        # Appending a tag that already sits elsewhere in the tree moves it.
        markup = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
        doc = BeautifulSoup(markup)
        second_para = doc('p')[1]
        bold = doc.find('b')
        doc('p')[1].append(doc.find('b'))
        self.assertEqual(bold.parent, second_para)
        self.assertEqual(
            str(doc),
            "<p>Don't leave me .</p> <p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        markup = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        doc = BeautifulSoup(markup)
        doc.find("div", id="nav").extract()
        self.assertEqual(str(doc), "<html>Real content here.</html>")

        # A simple case, a more complex test.
        markup = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        doc = BeautifulStoneSoup(markup)
        root = doc.doc
        numbers, roman, letters = doc("a")

        # State before extraction.
        self.assertEqual(roman.parent, root)
        previous_before = roman.previous
        last_inside = roman.nextSibling.previous
        self.assertEqual(previous_before, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(last_inside, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        # After extraction the subtree is detached and its former
        # neighbours are linked to each other.
        roman.extract()
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(last_inside.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(root.contents), 2)
        self.assertEqual(root.contents[0], numbers)
        self.assertEqual(root.contents[1], letters)

        # A more complex case.
        markup = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        doc = BeautifulStoneSoup(markup)
        one = doc.find(text="1")
        three = doc.find(text="3")
        # Not used afterwards; kept so the same lookups are performed.
        to_extract = doc.b
        doc.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, doc.a)
 
404
 
 
405
class TheManWithoutAttributes(SoupTest):
    """Checks access to tag attributes."""

    def testHasKey(self):
        markup = "<foo attr='bar'>"
        foo_tag = BeautifulSoup(markup).foo
        self.assertEqual(foo_tag.has_key('attr'), True)
 
411
 
 
412
class QuoteMeOnThat(SoupTest):
    """Checks how attribute values are quoted when rendered."""

    def testQuotedAttributeValues(self):
        # Single-quoted values are rendered with double quotes.
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        # A value containing double quotes keeps its single-quote delimiters.
        markup = """<foo attr='bar "brawls" happen'>a</foo>"""
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.renderContents(), markup)

        # A value containing both kinds of quote.
        doc.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        expected = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(doc.renderContents(), expected)

        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up &amp; stuff"></this>')

        # This is not what the original author had in mind, but it's
        # a legitimate interpretation of what they wrote.
        self.assertSoupEquals(
            """<a href="foo</a>, </a><a href="bar">baz</a>""",
            '<a href="foo&lt;/a&gt;, &lt;/a&gt;&lt;a href="></a>, <a href="bar">baz</a>')

        # SGMLParser generates bogus parse events when attribute values
        # contain embedded brackets, but at least Beautiful Soup fixes
        # it up a little.
        self.assertSoupEquals('<a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a>')
        self.assertSoupEquals(
            '<a href="http://foo.com/<a> and blah and blah',
            """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
 
440
 
 
441
 
 
442
 
 
443
class YoureSoLiteral(SoupTest):
    """Checks tags whose contents are taken literally instead of parsed."""

    def testLiteralMode(self):
        # The '<' inside <script> must not start a tag.
        markup = "<script>if (i<imgs.length)</script><b>Foo</b>"
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.script.contents[0], "if (i<imgs.length)")
        self.assertEqual(doc.b.contents[0], "Foo")

    def testTextArea(self):
        # Markup inside <textarea> is kept as one piece of text.
        markup = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        doc = BeautifulSoup(markup)
        first_child = doc.textarea.contents[0]
        self.assertEqual(first_child,
                         "<b>This is an example of an HTML tag</b><&<&")
 
456
 
 
457
class OperatorOverload(SoupTest):
    """Checks the attribute-style and item-style shortcuts on tags."""

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        doc = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        via_find = doc.find('b').find('i')
        self.assertEqual(doc.b.i, via_find)
        self.assertEqual(doc.b.i.string, 'bar')
        self.assertEqual(doc.b['id'], '1')
        self.assertEqual(doc.b.contents[0], 'foo')
        # A tag name that is absent from the document gives a falsy result.
        self.assertTrue(not doc.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(doc.bTag.iTag.string, 'bar')
        self.assertEqual(doc.b.iTag.string, 'bar')
        self.assertEqual(doc.find('b').find('i'), doc.bTag.iTag)
 
473
 
 
474
class NestableEgg(SoupTest):
    """Checks how tags nest, including markup with missing end tags."""

    def testParaInsideBlockquote(self):
        doc = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        quote = doc.blockquote
        self.assertEqual(quote.p.b.string, 'Foo')
        self.assertEqual(quote.b.string, 'Foo')
        # The second <p> is expected at the top level of the document.
        self.assertEqual(doc.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        # A table nested inside a cell of another table.
        markup = """<table id="1"><tr><td>Here's another table:
        <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.table.table.td.string, 'Juicy text')
        self.assertEqual(len(doc.findAll('table')), 2)
        self.assertEqual(len(doc.table.findAll('table')), 1)
        inner_table = doc.find('table', {'id': 2})
        self.assertEqual(inner_table.parent.parent.parent.name, 'table')

        # A table nested inside a <div> inside a cell.
        markup = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.table.tr.td.div.table.contents[0], "Foo")

        # Rows inside <thead>, <tbody> and <tfoot>.
        markup = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
        <tfoot><tr>Baz</tr></tfoot></table>"""
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.table.thead.tr.contents[0], "Foo")

    def testBadNestedTables(self):
        doc = BeautifulSoup("<table><tr><table><tr id='nested'>")
        nested_row = doc.table.tr.table.tr
        self.assertEqual(nested_row['id'], 'nested')
 
505
 
 
506
class CleanupOnAisleFour(SoupTest):
    """Checks the cleanup of markup that breaks SGMLParser or is just
    obnoxious."""

    def testSelfClosingtag(self):
        br_tag = BeautifulSoup("Foo<br/>Bar").find('br')
        self.assertEqual(str(br_tag), '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        # An unknown tag is closed at the end of its parent by default...
        markup = '<p>test1<selfclosing>test2'
        doc = BeautifulStoneSoup(markup)
        self.assertEqual(str(doc),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        # ...unless it is named in selfClosingTags.
        doc = BeautifulStoneSoup(markup, selfClosingTags='selfclosing')
        self.assertEqual(str(doc), '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        # The same markup parsed by the two parser classes: only the second
        # is expected to render <link> as self-closing.
        markup = "<item><link>http://foo.com/</link></item>"
        self.assertEqual(BeautifulStoneSoup(markup).renderContents(), markup)
        self.assertEqual(BeautifulSoup(markup).renderContents(),
                         '<item><link />http://foo.com/</item>')

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        pattern = re.compile("foo.*bar")
        doc = BeautifulSoup(xml)
        self.assertEqual(doc.find(text=pattern).string, "foobar")
        self.assertEqual(doc.find(text=pattern).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        pattern = re.compile("foo.*bar")
        doc = BeautifulSoup(xml)
        self.assertEqual(doc.find(text=pattern).string, "foobar")
        self.assertEqual(doc.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        pattern = re.compile(".*foo.*bar")
        doc = BeautifulSoup(xml)
        declared = "DOCTYPE foobar"
        self.assertEqual(doc.find(text=pattern).string, declared)
        self.assertEqual(doc.find(text=declared).__class__, Declaration)

    def testEntityConversions(self):
        markup = "&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;"
        doc = BeautifulStoneSoup(markup)
        # With no conversion requested, the entities pass through unchanged.
        self.assertSoupEquals(markup)

        xml_entities = BeautifulStoneSoup.XML_ENTITIES
        html_entities = BeautifulStoneSoup.HTML_ENTITIES
        xhtml_entities = BeautifulStoneSoup.XHTML_ENTITIES

        doc = BeautifulStoneSoup(markup, convertEntities=xml_entities)
        self.assertEqual(str(doc), "<<sacr&eacute; bleu!>>")

        doc = BeautifulStoneSoup(markup, convertEntities=xml_entities)
        self.assertEqual(str(doc), "<<sacr&eacute; bleu!>>")

        doc = BeautifulStoneSoup(markup, convertEntities=html_entities)
        self.assertEqual(unicode(doc), u"<<sacr\xe9 bleu!>>")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        markup = "&lt;&trade;&apos;"
        doc = BeautifulStoneSoup(markup, convertEntities=xml_entities)
        self.assertEqual(unicode(doc), u"<&trade;'")

        doc = BeautifulStoneSoup(markup, convertEntities=html_entities)
        self.assertEqual(unicode(doc), u"<\u2122&apos;")

        doc = BeautifulStoneSoup(markup, convertEntities=xhtml_entities)
        self.assertEqual(unicode(doc), u"<\u2122'")

        # A malformed numeric entity is left exactly as written.
        invalid_entity = "foo&#bar;baz"
        doc = BeautifulStoneSoup(invalid_entity,
                                 convertEntities=html_entities)
        self.assertEqual(str(doc), invalid_entity)

    def testNonBreakingSpaces(self):
        doc = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
                            convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(unicode(doc), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c')

    def testEntityReplacement(self):
        self.assertSoupEquals('<b>hello&nbsp;there</b>')

    def testEntitiesInAttributeValues(self):
        # Decimal and hexadecimal character references give the same bytes.
        self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>')
        self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>')

        doc = BeautifulSoup('<x t="&gt;&trade;">',
                            convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(unicode(doc), u'<x t="&gt;\u2122"></x>')

        uri = "http://crummy.com?sacr&eacute;&amp;bleu"
        link = '<a href="%s"></a>' % uri
        doc = BeautifulSoup(link)
        self.assertEqual(unicode(doc), link)
        #self.assertEquals(unicode(soup.a['href']), uri)

        doc = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(unicode(doc),
                         link.replace("&eacute;", u"\xe9"))

        uri = "http://crummy.com?sacr&eacute;&bleu"
        link = '<a href="%s"></a>' % uri
        doc = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(unicode(doc.a['href']),
                         uri.replace("&eacute;", u"\xe9"))

    def testNakedAmpersands(self):
        options = {'convertEntities': BeautifulStoneSoup.HTML_ENTITIES}
        doc = BeautifulStoneSoup("AT&T ", **options)
        self.assertEqual(str(doc), 'AT&amp;T ')

        sentence = "AT&T was Ma Bell"
        doc = BeautifulStoneSoup(sentence, **options)
        self.assertEqual(str(doc), sentence.replace('&', '&amp;'))

        # A bare '&' in a URL is escaped on output...
        invalid_url = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        valid_url = invalid_url.replace('&', '&amp;')
        doc = BeautifulStoneSoup(invalid_url)
        self.assertEqual(str(doc), valid_url)

        # ...and an already-escaped one is left alone.
        doc = BeautifulStoneSoup(valid_url)
        self.assertEqual(str(doc), valid_url)
 
650
 
 
651
class EncodeRed(SoupTest):
 
652
    """Tests encoding conversion, Unicode conversion, and Microsoft
 
653
    smart quote fixes."""
 
654
 
 
655
    def testUnicodeDammitStandalone(self):
 
656
        markup = "<foo>\x92</foo>"
 
657
        dammit = UnicodeDammit(markup)
 
658
        self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")
 
659
 
 
660
        hebrew = "\xed\xe5\xec\xf9"
 
661
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
 
662
        self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
 
663
        self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
 
664
 
 
665
    def testGarbageInGarbageOut(self):
 
666
        ascii = "<foo>a</foo>"
 
667
        unicodeData = u"<foo>\u00FC</foo>"
 
668
        utf8 = unicodeData.encode("utf-8")
 
669
 
 
670
        asciiSoup = BeautifulStoneSoup(ascii)
 
671
        self.assertEquals(ascii, str(asciiSoup))
 
672
 
 
673
        utf8Soup = BeautifulStoneSoup(utf8)
 
674
        self.assertEquals(utf8, str(utf8Soup))
 
675
        self.assertEquals(utf8Soup.originalEncoding, "utf-8")
 
676
 
 
677
        utf8Soup = BeautifulStoneSoup(unicodeData)
 
678
        self.assertEquals(utf8, str(utf8Soup))
 
679
        self.assertEquals(utf8Soup.originalEncoding, None)
 
680
 
 
681
        unicodeSoup = BeautifulStoneSoup(unicodeData)
 
682
        self.assertEquals(unicodeData, unicode(unicodeSoup))
 
683
 
 
684
    def testHandleInvalidCodec(self):
 
685
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
 
686
            soup = BeautifulSoup("Räksmörgås", fromEncoding=bad_encoding)
 
687
            self.assertEquals(soup.originalEncoding, 'utf-8')
 
688
 
 
689
    def testUnicodeSearch(self):
 
690
        html = u'<html><body><h1>Räksmörgås</h1></body></html>'
 
691
        soup = BeautifulSoup(html)
 
692
        self.assertEqual(soup.find(text=u'Räksmörgås'),u'Räksmörgås')
 
693
 
 
694
    def testRewrittenXMLHeader(self):
 
695
        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
 
696
        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
 
697
        soup = BeautifulStoneSoup(euc_jp)
 
698
        if soup.originalEncoding != "euc-jp":
 
699
            raise "Test failed when parsing euc-jp document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it."
 
700
 
 
701
        self.assertEquals(soup.originalEncoding, "euc-jp")
 
702
        self.assertEquals(str(soup), utf8)
 
703
 
 
704
        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
 
705
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
 
706
        self.assertSoupEquals(old_text, new_text)
 
707
 
 
708
    def testRewrittenMetaTag(self):
 
709
        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
 
710
        soup = BeautifulSoup(no_shift_jis_html)
 
711
        self.assertEquals(soup.originalEncoding, "windows-1252")
 
712
 
 
713
        # Beautiful Soup used to try to rewrite the meta tag even if the
 
714
        # meta tag got filtered out by the strainer. This test makes
 
715
        # sure that doesn't happen.
 
716
        strainer = SoupStrainer('pre')
 
717
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
 
718
        self.assertEquals(soup.contents[0].name, 'pre')
 
719
 
 
720
        shift_jis_html = '''<html><head>\n<meta http-equiv="Content-type" content="text/html; charset=x-sjis" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
 
721
        soup = BeautifulSoup(shift_jis_html)
 
722
        if soup.originalEncoding != "shift-jis":
 
723
            raise "Test failed when parsing shift-jis document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it."
 
724
        self.assertEquals(soup.originalEncoding, "shift-jis")
 
725
        self.assertEquals(str(soup), '<html><head>\n<meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</pre></body></html>')
 
726
        self.assertEquals(soup.renderContents("shift-jis"),
 
727
                          shift_jis_html.replace('x-sjis', 'shift-jis'))
 
728
 
 
729
        isolatin ="""<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
 
730
        soup = BeautifulSoup(isolatin)
 
731
        self.assertSoupEquals(soup.__str__("utf-8"),
 
732
                              isolatin.replace("ISO-Latin-1", "utf-8").replace("\xe9", "\xc3\xa9"))
 
733
 
 
734
 
 
735
    def testHebrew(self):
 
736
        iso_8859_8= '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
 
737
        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
 
738
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
 
739
        self.assertEquals(str(soup), utf8)
 
740
 
 
741
    def testSmartQuotesNotSoSmartAnymore(self):
 
742
        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
 
743
                              '&lsquo;Foo&rsquo; <!--blah-->')
 
744
 
 
745
    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
 
746
        smartQuotes = "Il a dit, \x8BSacr&eacute; bl&#101;u!\x9b"
 
747
        soup = BeautifulSoup(smartQuotes)
 
748
        self.assertEquals(str(soup),
 
749
                          'Il a dit, &lsaquo;Sacr&eacute; bl&#101;u!&rsaquo;')
 
750
        soup = BeautifulSoup(smartQuotes, convertEntities="html")
 
751
        self.assertEquals(str(soup),
 
752
                          'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
 
753
 
 
754
    def testDontSeeSmartQuotesWhereThereAreNone(self):
 
755
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
 
756
        self.assertSoupEquals(utf_8)
 
757
 
 
758
# Script entry point: hand control to unittest's command-line runner.
if __name__ == "__main__":
    unittest.main()