1
"""Tests to ensure that the lxml tree builder generates good trees."""
9
LXML_VERSION = lxml.etree.LXML_VERSION
10
except ImportError, e:
15
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
21
from bs4.element import Comment, Doctype, SoupStrainer
22
from bs4.testing import skipIf
23
from bs4.tests import test_htmlparser
24
from bs4.testing import (
25
HTMLTreeBuilderSmokeTest,
26
XMLTreeBuilderSmokeTest,
33
"lxml seems not to be present, not testing its tree builder.")
34
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
35
"""See ``HTMLTreeBuilderSmokeTest``."""
38
def default_builder(self):
    """Return the tree builder exercised by this test case.

    A new ``LXMLTreeBuilder`` is constructed on every call; nothing is
    cached on the instance.
    """
    # NOTE(review): the line directly above this def is not visible in the
    # reviewed excerpt; if a decorator preceded it, it must be kept.
    return LXMLTreeBuilder()
41
def test_out_of_range_entity(self):
42
self.assertSoupEquals(
43
"<p>foo�bar</p>", "<p>foobar</p>")
44
self.assertSoupEquals(
45
"<p>foo�bar</p>", "<p>foobar</p>")
46
self.assertSoupEquals(
47
"<p>foo�bar</p>", "<p>foobar</p>")
49
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
50
# test if an old version of lxml is installed.
53
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
54
"Skipping doctype test for old version of lxml to avoid segfault.")
55
def test_empty_doctype(self):
56
soup = self.soup("<!DOCTYPE>")
57
doctype = soup.contents[0]
58
self.assertEqual("", doctype.strip())
60
def test_beautifulstonesoup_is_xml_parser(self):
    """Check the deprecated ``BeautifulStoneSoup`` class parses as XML.

    Building a soup from ``<b />`` must serialize the tag as ``<b/>``, and
    constructing the class must emit a deprecation warning.
    """
    # Make sure that the deprecated BSS class uses an xml builder
    # if one is installed.
    with warnings.catch_warnings(record=True) as w:
        # Record every warning, even one already issued from the same
        # location earlier in the run; otherwise ``w`` may stay empty and
        # ``w[0]`` below would raise IndexError.
        warnings.simplefilter("always")
        soup = BeautifulStoneSoup("<b />")
    self.assertEqual(u"<b/>", unicode(soup.b))
    self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
68
def test_real_xhtml_document(self):
69
"""lxml strips the XML definition from an XHTML doc, which is fine."""
70
markup = b"""<?xml version="1.0" encoding="utf-8"?>
71
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
72
<html xmlns="http://www.w3.org/1999/xhtml">
73
<head><title>Hello.</title></head>
76
soup = self.soup(markup)
78
soup.encode("utf-8").replace(b"\n", b''),
79
markup.replace(b'\n', b'').replace(
80
b'<?xml version="1.0" encoding="utf-8"?>', b''))
85
"lxml seems not to be present, not testing its XML tree builder.")
86
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``XMLTreeBuilderSmokeTest``."""

    # NOTE(review): the lines between the class docstring and this def are
    # not visible in the reviewed excerpt; if a decorator preceded the
    # method, it must be kept.
    def default_builder(self):
        """Return a new ``LXMLTreeBuilderForXML`` for the smoke tests."""
        return LXMLTreeBuilderForXML()