1
require 'html/tokenizer'
3
require 'html/selector'
4
require 'html/sanitizer'
7
# A top-level HTML document. You give it a body of text, and it will parse that
8
# text into a tree of nodes.
9
class Document #:nodoc:
11
# The root of the parsed document.
14
# Create a new Document from the given text.
15
def initialize(text, strict=false, xml=false)
16
tokenizer = Tokenizer.new(text)
18
node_stack = [ @root ]
19
while token = tokenizer.next
20
node = Node.parse(node_stack.last, tokenizer.line, tokenizer.position, token, strict)
22
node_stack.last.children << node unless node.tag? && node.closing == :close
24
if node_stack.length > 1 && node.closing == :close
25
if node_stack.last.name == node.name
26
if node_stack.last.children.empty?
27
node_stack.last.children << Text.new(node_stack.last, node.line, node.position, "")
31
open_start = node_stack.last.position - 20
32
open_start = 0 if open_start < 0
33
close_start = node.position - 20
34
close_start = 0 if close_start < 0
36
ignoring attempt to close #{node_stack.last.name} with #{node.name}
37
opened at byte #{node_stack.last.position}, line #{node_stack.last.line}
38
closed at byte #{node.position}, line #{node.line}
39
attributes at open: #{node_stack.last.attributes.inspect}
40
text around open: #{text[open_start,40].inspect}
41
text around close: #{text[close_start,40].inspect}
43
strict ? raise(msg) : warn(msg)
45
elsif !node.childless?(xml) && node.closing != :close
52
# Search the tree for (and return) the first node that matches the given
53
# conditions. The conditions are interpreted differently for different node
54
# types; see HTML::Text#find and HTML::Tag#find.
56
@root.find(conditions)
59
# Search the tree for (and return) all nodes that match the given
60
# conditions. The conditions are interpreted differently for different node
61
# types; see HTML::Text#find and HTML::Tag#find.
62
def find_all(conditions)
63
@root.find_all(conditions)