3
# Sanitizes +text+ and returns the cleaned string.
#
# Input rejected by #sanitizeable? is handed back untouched; anything else is
# run through #tokenize and the resulting pieces are joined into one string.
#
# text    - the markup to clean (may be nil).
# options - Hash forwarded unchanged to #tokenize.
def sanitize(text, options = {})
  return text unless sanitizeable?(text)

  tokenize(text, options).join
end
8
# Tells whether +text+ is worth running through the tokenizer.
#
# Returns false for nil, for an empty string, and for text that contains no
# "<" at all; returns true otherwise.
def sanitizeable?(text)
  return false if text.nil? || text.empty?

  !text.index("<").nil?
end
13
# Splits +text+ into tokens, parses each token into a node and lets
# #process_node decide what is appended to the output.
#
# text    - the markup to tokenize.
# options - Hash passed through to #process_node for every node.
#
# Returns the Array that #process_node appended to (the caller joins it).
def tokenize(text, options)
  tokenizer = HTML::Tokenizer.new(text)
  # Accumulator handed to #process_node; it has to exist before the loop and
  # be the method's return value so that #sanitize can call #join on it.
  result = []
  while (token = tokenizer.next)
    node = Node.parse(nil, 0, 0, token, false)
    process_node node, result, options
  end
  result
end
23
# Hook called by #tokenize once per parsed node, with the output accumulator
# (+result+) and the options Hash. The sanitizer subclasses in this file
# override it to decide what, if anything, gets appended to +result+.
# NOTE(review): no body for this base version is visible in this excerpt --
# confirm against the full file what it does when a subclass does not override it.
def process_node(node, result, options)
28
# Sanitizer that keeps only text nodes, dropping every tag and HTML comment.
class FullSanitizer < Sanitizer
  # Sanitizes +text+ repeatedly until the output stops changing.
  #
  # text    - the markup to clean (may be nil).
  # options - Hash forwarded to the parent implementation.
  #
  # Returns the cleaned string, or +text+ itself when nothing changed.
  def sanitize(text, options = {})
    # The parent implementation produces the first-pass result.
    result = super
    # strip any comments, and if they have a newline at the end (ie. line with
    # only a comment) strip that too. A non-mutating gsub is used because the
    # parent may hand back the caller's own string object.
    result = result.gsub(/<!--(.*?)-->[\n]?/m, "") if result
    # Recurse - handle all dirty nested tags
    result == text ? result : sanitize(result, options)
  end

  # Appends +node+ to +result+ only when it is exactly an HTML::Text
  # (an exact class comparison, so subclasses of HTML::Text are not kept).
  def process_node(node, result, options)
    result << node.to_s if node.class == HTML::Text
  end
end
43
# Sanitizer that removes the tags named in +included_tags+ and keeps every
# other node as-is.
class LinkSanitizer < FullSanitizer
  cattr_accessor :included_tags, :instance_writer => false
  self.included_tags = Set.new(%w(a href))

  # True only when +text+ is non-empty, contains "<a" or "<href", and also
  # contains a ">" somewhere.
  def sanitizeable?(text)
    !(text.nil? || text.empty? || !((text.index("<a") || text.index("<href")) && text.index(">")))
  end

  # Appends +node+ to +result+ unless it is an HTML::Tag whose name is listed
  # in +included_tags+.
  def process_node(node, result, options)
    result << node.to_s unless node.is_a?(HTML::Tag) && included_tags.include?(node.name)
  end
end
57
class WhiteListSanitizer < Sanitizer
58
# Declares one class-level configuration accessor per setting name below.
# Instance-level writers are suppressed (:instance_writer => false), so the
# values are assigned through `self.<name> = ...` in the class body.
[:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
 :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
  class_inheritable_accessor attr, :instance_writer => false
end
63
# A regular expression of the valid characters used to separate protocols like
# the ':' in 'http://foo.com'. Besides the literal colon it matches the
# encoded spellings "&#58" (with any number of leading zeros), "&#x70", and
# "%3A" / "&#37;3A". The entity text must stay literal in the pattern: once it
# is decoded the alternatives collapse to single characters such as "p", which
# would split ordinary values like "http://..." in the wrong place.
self.protocol_separator = /:|(&#0*58)|(&#x70)|(%|&#37;)3A/
67
# Specifies a Set of HTML attributes that can have URIs.
self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))

# Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
# to just escaping harmless tags like <font>
self.bad_tags = Set.new(%w(script))

# Specifies the default Set of tags that the #sanitize helper will allow unscathed.
self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
  sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dt dd abbr
  acronym a img blockquote del ins))

# Specifies the default Set of html attributes that the #sanitize helper will leave
# in place on an allowed tag.
self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))

# Specifies the default Set of protocols that the #sanitize helper will accept in URI attributes.
self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
  feed svn urn aim rsync tag ssh sftp rtsp afs))

# Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
# NOTE(review): the tail of this list is cut off in this copy of the file; the literal is closed
# after the last visible entry -- verify against the complete source for any missing properties.
self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
  border-color border-left-color border-right-color border-top-color clear color cursor direction display
  elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
  overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
  speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space))

# Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
  collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
  nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))

# Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
self.shorthand_css_properties = Set.new(%w(background border margin padding))
103
# Sanitizes a block of css code. Used by #sanitize when it comes across a style attribute
#
# Works on the string form of +style+ and collects accepted `property: value;`
# declarations into +clean+.
# NOTE(review): the initialisation of +clean+, the action taken when the
# validation below fails, and the method's return value are not visible in
# this excerpt -- confirm against the complete source.
104
def sanitize_css(style)
106
# Replace every url(...) reference with a single space before validating.
style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
109
# Two shape checks: the text may only be built from a restricted character
# set (plus simple quoted strings and parenthesised number lists), and it must
# read as a sequence of `property: value` declarations.
# NOTE(review): `^` and `$` anchor per line in Ruby, not per string; confirm
# whether `\A` / `\z` was intended for these validations.
if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/ ||
110
style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*$/
115
# Visit each `property: value` pair found in the text.
style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
116
# A whitelisted property is kept with its value unchanged.
if allowed_css_properties.include?(prop.downcase)
117
clean << prop + ': ' + val + ';'
118
# Otherwise the part of the name before the first '-' must be a known shorthand property...
elsif shorthand_css_properties.include?(prop.split('-')[0].downcase)
119
# ...and no whitespace-separated keyword may fail both the keyword whitelist
# and the colour / rgb() / short-length pattern.
unless val.split().any? do |keyword|
120
!allowed_css_keywords.include?(keyword) &&
121
keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
123
clean << prop + ': ' + val + ';'
131
# Prepares the per-call whitelist state in +options+ and then delegates to the
# parent tokenizer.
#
# options[:parent]     - reset to an empty Array on every call; #process_node
#                        uses it as a stack of currently open tag names.
# options[:attributes] - defaults to +allowed_attributes+ when not supplied.
# options[:tags]       - defaults to +allowed_tags+ when not supplied.
#
# Note that +options+ is modified in place.
def tokenize(text, options)
  options[:parent] = []
  options[:attributes] ||= allowed_attributes
  options[:tags] ||= allowed_tags
  # Without this call the method would return the tag Set rather than the
  # processed nodes.
  super
end
138
# Appends the whitelisted form of +node+ (or nil) to +result+.
#
# Tags update the open-tag stack in options[:parent], have their attributes
# cleaned by #process_attributes_for, and are kept only when their name is in
# options[:tags]. Any other node is dropped while the innermost open tag is one
# of +bad_tags+; otherwise it is kept with every "<" escaped.
def process_node(node, result, options)
  result << case node
            when HTML::Tag
              if node.closing == :close
                options[:parent].shift
              else
                options[:parent].unshift node.name
              end

              process_attributes_for node, options

              options[:tags].include?(node.name) ? node : nil
            else
              # Escape to the entity; replacing "<" with a literal "<" would do nothing.
              bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
            end
end
155
# Cleans the attributes of +node+ in place.
#
# An attribute is deleted when its name is not in options[:attributes] or when
# #contains_bad_protocols? flags its value. A surviving `style` attribute is
# passed through #sanitize_css; every other surviving value is HTML-unescaped
# and then re-escaped, so entities already present are not escaped twice.
#
# Returns immediately (nil) when the node has no attributes.
def process_attributes_for(node, options)
  return unless node.attributes

  # Iterate over a snapshot of the keys because entries are deleted in the loop.
  node.attributes.keys.each do |attr_name|
    value = node.attributes[attr_name].to_s

    if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
      node.attributes.delete(attr_name)
    else
      node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI.escapeHTML(CGI.unescapeHTML(value))
    end
  end
end
168
# Tells whether +value+ of the attribute +attr_name+ uses a protocol that is
# not whitelisted.
#
# Only attributes listed in +uri_attributes+ are examined. The value counts as
# carrying a protocol when it has a ':' before any '/', or contains one of the
# encoded separator spellings ("&#58", "&#x70", "%3A", "&#37;3A"). In that
# case the text before the first +protocol_separator+ match must be in
# +allowed_protocols+. The entity text has to stay literal in the pattern: in
# decoded form the "&#x70" alternative degenerates to a bare "p" and flags
# ordinary URLs.
#
# Returns a truthy value for a disallowed protocol, false/nil otherwise.
def contains_bad_protocols?(attr_name, value)
  uri_attributes.include?(attr_name) &&
    (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(%|&#37;)3A/ && !allowed_protocols.include?(value.split(protocol_separator).first))
end