~facelessuser/beautifulsoup/lxml-fix

« back to all changes in this revision

Viewing changes to bs4/builder/_html5lib.py

  • Committer: Leonard Richardson
  • Date: 2019-07-21 19:50:49 UTC
  • Revision ID: leonardr@segfault.org-20190721195049-vzupkzxai3bx3r8t
Implemented line number tracking for html5lib.

Show diffs side-by-side

added

removed

Lines of Context:
45
45
 
46
46
    features = [NAME, PERMISSIVE, HTML_5, HTML]
47
47
 
 
48
    # html5lib can tell us which line number and position in the
 
49
    # original file is the source of an element.
 
50
    TRACKS_LINE_NUMBERS = True
 
51
    
48
52
    def prepare_markup(self, markup, user_specified_encoding,
49
53
                       document_declared_encoding=None, exclude_encodings=None):
50
54
        # Store the user-specified encoding for use later on.
62
66
        if self.soup.parse_only is not None:
63
67
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
64
68
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
65
 
 
 
69
        self.underlying_builder.parser = parser
66
70
        extra_kwargs = dict()
67
71
        if not isinstance(markup, unicode):
68
72
            if new_html5lib:
70
74
            else:
71
75
                extra_kwargs['encoding'] = self.user_specified_encoding
72
76
        doc = parser.parse(markup, **extra_kwargs)
73
 
 
 
77
        
74
78
        # Set the character encoding detected by the tokenizer.
75
79
        if isinstance(markup, unicode):
76
80
            # We need to special-case this because html5lib sets
84
88
                # with other tree builders.
85
89
                original_encoding = original_encoding.name
86
90
            doc.original_encoding = original_encoding
87
 
 
 
91
        self.underlying_builder.parser = None
 
92
            
88
93
    def create_treebuilder(self, namespaceHTMLElements):
89
94
        self.underlying_builder = TreeBuilderForHtml5lib(
90
 
            namespaceHTMLElements, self.soup)
 
95
            namespaceHTMLElements, self.soup,
 
96
            store_line_numbers=self.store_line_numbers
 
97
        )
91
98
        return self.underlying_builder
92
99
 
93
100
    def test_fragment_to_document(self, fragment):
96
103
 
97
104
 
98
105
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
99
 
 
100
 
    def __init__(self, namespaceHTMLElements, soup=None):
 
106
    
 
107
    def __init__(self, namespaceHTMLElements, soup=None,
 
108
                 store_line_numbers=True, **kwargs):
101
109
        if soup:
102
110
            self.soup = soup
103
111
        else:
104
112
            from bs4 import BeautifulSoup
105
 
            self.soup = BeautifulSoup("", "html.parser")
 
113
            # TODO: Why is the parser 'html.parser' here? To avoid an
 
114
            # infinite loop?
 
115
            self.soup = BeautifulSoup(
 
116
                "", "html.parser", store_line_numbers=store_line_numbers,
 
117
                **kwargs
 
118
            )
106
119
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
107
120
 
 
121
        # This will be set later to an html5lib.html5parser.HTMLParser
 
122
        # object, which we can use to track the current line number.
 
123
        self.parser = None
 
124
        self.store_line_numbers = store_line_numbers
 
125
        
108
126
    def documentClass(self):
109
127
        self.soup.reset()
110
128
        return Element(self.soup, self.soup, None)
118
136
        self.soup.object_was_parsed(doctype)
119
137
 
120
138
    def elementClass(self, name, namespace):
121
 
        tag = self.soup.new_tag(name, namespace)
 
139
        kwargs = {}
 
140
        if self.parser and self.store_line_numbers:
 
141
            # This represents the point immediately after the end of the
 
142
            # tag. We don't know when the tag started, but we do know
 
143
            # where it ended -- the character just before this one.
 
144
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
 
145
            kwargs['sourceline'] = sourceline
 
146
            kwargs['sourcepos'] = sourcepos-1
 
147
        tag = self.soup.new_tag(name, namespace, **kwargs)
 
148
 
122
149
        return Element(tag, self.soup, namespace)
123
150
 
124
151
    def commentClass(self, data):
126
153
 
127
154
    def fragmentClass(self):
128
155
        from bs4 import BeautifulSoup
 
156
        # TODO: Why is the parser 'html.parser' here? To avoid an
 
157
        # infinite loop?
129
158
        self.soup = BeautifulSoup("", "html.parser")
130
159
        self.soup.name = "[document_fragment]"
131
160
        return Element(self.soup, self.soup, None)