1
# Miro - an RSS based video player application
2
# Copyright (C) 2005-2010 Participatory Culture Foundation
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation; either version 2 of the License, or
7
# (at your option) any later version.
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
14
# You should have received a copy of the GNU General Public License
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18
# In addition, as a special exception, the copyright holders give
19
# permission to link the code of portions of this program with the OpenSSL
# library.
22
# You must obey the GNU General Public License in all respects for all of
23
# the code used other than OpenSSL. If you modify file(s) with this
24
# exception, you may extend this exception to your version of the file(s),
25
# but you are not obligated to do so. If you do not wish to do so, delete
26
# this exception statement from your version. If you delete this exception
27
# statement from all source files in the program, then also delete it here.
"""``miro.xhtmltools`` -- XML related utility functions."""
32
import logging
import random
import re
import xml.sax.saxutils
from HTMLParser import HTMLParser, HTMLParseError
from urllib import quote, quote_plus, unquote
40
class XHTMLifier(HTMLParser):
    """Very simple parser to convert HTML to XHTML.

    Markup is re-emitted into ``self.output`` as it is parsed.  Elements
    that have been opened but not yet closed are tracked on ``self.stack``
    so that :meth:`convert` can close whatever is still open once the
    input ends.
    """

    # FIXME - this should probably be rewritten to use StringIO.
    def convert(self, data, add_top_tags=False, filter_font_tags=False):
        """Converts an HTML data unicode string to an XHTML data
        unicode string.

        :param data: the HTML text to convert
        :param add_top_tags: if true, wrap the result in
            ``<html><head></head><body>`` ... ``</body></html>``
        :param filter_font_tags: if true, ``<font>`` tags are dropped
            (the text they contain is kept)

        :returns: the converted text, or ``None`` if the parser raised
            ``HTMLParseError``
        """
        # Start from a clean parser state so that one instance can be
        # reused, even after an earlier call stopped on a parse error.
        self.reset()
        self.output = u'<html><head></head><body>' if add_top_tags else u''
        self.stack = []
        self.filter_font_tags = filter_font_tags
        try:
            self.feed(data)
            self.close()
        except HTMLParseError:
            logging.warning("xhtmlifier: parse exception")
            logging.debug("data: '%s'", data)
            return None

        # Close every element the document left open, innermost first.
        while self.stack:
            self.output += u'</' + self.stack.pop() + u'>'
        if add_top_tags:
            self.output += u'</body></html>'
        return self.output

    def _is_filtered(self, tag):
        """Returns True if ``tag`` is a font tag that must be dropped."""
        return bool(self.filter_font_tags) and tag.lower() == 'font'

    @staticmethod
    def _format_attrs(attrs):
        """Serializes ``(name, value)`` pairs as XHTML attributes.

        Valueless HTML attributes (``<input disabled>``) are expanded to
        ``name="name"`` because XML does not allow minimized attributes.
        """
        parts = []
        for name, value in attrs:
            if value is None:
                value = name
            parts.append(u' ' + name + u'=' +
                         xml.sax.saxutils.quoteattr(value))
        return u''.join(parts)

    def handle_starttag(self, tag, attrs):
        """Emits an opening tag and records it as open.

        ``<br>`` is written as the empty element ``<br/>`` and is never
        put on the stack.  Filtered font tags are neither written nor
        recorded, so no closing tag is ever produced for them.
        """
        if tag.lower() == 'br':
            self.output += u'<br/>'
            return
        if self._is_filtered(tag):
            return
        self.output += u'<' + tag + self._format_attrs(attrs) + u'>'
        self.stack.append(tag)

    def handle_endtag(self, tag):
        """Closes open elements up to and including ``tag``.

        The outermost open element is never closed here (note the
        ``len(self.stack) > 1`` checks); :meth:`convert` closes it when
        the input ends.
        """
        if tag.lower() == 'br' or self._is_filtered(tag):
            return
        if len(self.stack) > 1:
            temp = self.stack.pop()
            self.output += u'</' + temp + u'>'
            # Badly nested input: keep closing until the matching tag is
            # found or only the outermost element remains.
            while temp != tag and len(self.stack) > 1:
                temp = self.stack.pop()
                self.output += u'</' + temp + u'>'

    def handle_startendtag(self, tag, attrs):
        """Emits a self-closing tag, keeping its attributes."""
        if self._is_filtered(tag):
            return
        self.output += u'<' + tag + self._format_attrs(attrs) + u'/>'

    def handle_data(self, data):
        """Emits text with ``&`` and ``<`` escaped for XML."""
        # '&' has to be replaced first, otherwise the '&' of '&lt;' would
        # be escaped a second time.
        data = data.replace(u'&', u'&amp;')
        data = data.replace(u'<', u'&lt;')
        self.output += data

    def handle_charref(self, name):
        """Passes a numeric character reference through unchanged."""
        self.output += u'&#' + name + ';'

    def handle_entityref(self, name):
        """Passes a named entity reference through unchanged."""
        self.output += u'&' + name + ';'
115
"""Parses HTML entities in data"""
116
return xml.sax.saxutils.unescape(data)
119
"""Encodes string for use in a URL"""
120
if isinstance(data, unicode):
121
data = data.encode('utf-8', 'replace')
124
return unicode(quote(data))
126
def xhtmlify(data, add_top_tags=False, filter_font_tags=False):
    """Returns XHTMLified version of HTML document

    The conversion is attempted up to three times, each with a fresh
    parser, stopping at the first attempt that does not fail:

    1. with the options exactly as given;
    2. without filtering font tags (only if filtering was requested);
    3. without filtering font tags and with ``&quot;`` replaced by ``"``.
       This fixes bug #10095 where Google Video items are sometimes half
       quoted.

    :returns: the converted text, or ``u""`` if every attempt failed
    """
    ret = XHTMLifier().convert(data, add_top_tags, filter_font_tags)

    # if we got a bad return, try it again without filtering font
    # tags
    if ret is None and filter_font_tags:
        ret = XHTMLifier().convert(data, add_top_tags,
                                   filter_font_tags=False)

    # if that's still bad, try converting &quot; to ".
    if ret is None:
        ret = XHTMLifier().convert(data.replace("&quot;", '"'), add_top_tags,
                                   filter_font_tags=False)

    # never hand None back to the caller
    if ret is None:
        ret = u""
    return ret
150
# Matches a leading XML declaration: group 1 is the text between "<?xml"
# and "?>", group 2 is the rest of the document.  The lookahead keeps
# other processing instructions whose target merely starts with "xml"
# (e.g. "<?xml-stylesheet ...?>") from being mistaken for the declaration.
XML_HEADER_RE = re.compile(r"^<\?xml(?=[\s?])\s*(.*?)\s*\?>(.*)", re.S)


def fix_xml_header(data, charset):
    """Adds a <?xml ?> header to the given xml data or replaces an
    existing one without a charset with one that has a charset

    :param data: the XML document text
    :param charset: the encoding name to declare

    :returns: ``data`` unchanged if its declaration already names an
        encoding, otherwise ``data`` with a declaration that does
    """
    header = XML_HEADER_RE.match(data)
    if header is None:
        # no declaration at all: add one
        return '<?xml version="1.0" encoding="%s"?>%s' % (charset, data)

    xml_decl, the_rest = header.groups()
    if 'encoding' in xml_decl:
        return data

    # change the existing declaration to include the charset
    return '<?xml %s encoding="%s"?>%s' % (xml_decl, charset, the_rest)
170
# Groups: 1 = everything before <head>, 2 = attributes of the <head> tag
# (surrounding whitespace stripped), 3 = contents of <head>, 4 = everything
# after </head>.
HTML_HEADER_RE = re.compile(
    u"^(.*)<\\s*head\\s*(.*?)\\s*>(.*?)</\\s*head\\s*>(.*)", re.I | re.S)


def fix_html_header(data, charset):
    """Adds a <meta http-equiv="Content-Type" content="text/html;
    charset=blah"> tag to an HTML document

    Since we're only feeding this to our own HTML Parser anyway, we
    don't care that it might bung up XHTML.

    :param data: the HTML document text
    :param charset: the charset name to declare

    :returns: ``data`` unchanged if it has no ``<head>`` element or its
        head already mentions ``content-type``; otherwise ``data`` with
        the meta tag inserted as the first child of ``<head>``
    """
    header = HTML_HEADER_RE.match(data)
    if header is None:
        # something is very wrong with this HTML
        return data

    before, head_attrs, head_tags, after = header.groups()
    # this isn't exactly robust, but neither is scraping HTML
    if 'content-type' in head_tags.lower():
        return data

    # The regex strips the whitespace between "head" and its attributes,
    # so put one space back or "<head lang=...>" becomes "<headlang=...>".
    if head_attrs:
        head_attrs = ' ' + head_attrs
    return (before + '<head' + head_attrs +
            '><meta http-equiv="Content-Type" content="text/html; charset=' +
            charset + '">' + head_tags + '</head>' + after)
192
def url_encode_dict(orig):
    """Converts a Python dictionary to data suitable for a POST or GET
    submission

    :param orig: dict mapping field names to a string, or to a list or
        tuple of strings (the field is then repeated once per item)

    :returns: the ``key=value`` pairs joined with ``&``; values that are
        not strings are logged and left out
    """
    def _quote(text):
        # Encode unicode as UTF-8 before quoting: Python 2's quote_plus
        # fails on non-ASCII unicode input.
        if isinstance(text, unicode):
            text = text.encode('utf-8')
        return quote_plus(text)

    output = []
    for key, val in orig.items():
        if isinstance(val, (list, tuple)):
            values = val
        else:
            values = [val]
        for v in values:
            if isinstance(v, basestring):
                output.append('%s=%s' % (_quote(key), _quote(v)))
            else:
                logging.warning("url_encode_dict: trying to encode non-string: '%s'", repr(v))
    return '&'.join(output)
207
def multipart_encode(post_vars, files):
    """Encodes form fields and files as a multipart/form-data body.

    :param post_vars: dict mapping field names to string values, or
        ``None``; unicode values are encoded as UTF-8, with characters
        that cannot be encoded replaced by XML character references
    :param files: dict mapping field names to dicts with the keys
        ``'filename'``, ``'mimetype'`` and ``'handle'`` (an open
        file-like object), or ``None``; every handle that is read is
        closed, even if reading it fails

    :returns: ``(body, boundary)``
    """
    # Generate a random 64bit number for our boundaries
    boundary = 'dp%x' % random.getrandbits(64)
    output = []
    if post_vars is not None:
        for key, value in post_vars.items():
            if isinstance(value, unicode):
                value = value.encode('utf8', 'xmlcharrefreplace')
            output.append('--%s\r\n' % boundary)
            output.append('Content-Disposition: form-data; name="%s"\r\n\r\n' %
                          quote_plus(key))
            output.append(value)
            output.append('\r\n')
    if files is not None:
        for key, info in files.items():
            handle = info['handle']
            # close the handle even if read() raises
            try:
                contents = handle.read()
            finally:
                handle.close()
            output.append('--%s\r\n' % boundary)
            output.append('Content-Disposition: form-data; name="%s"; filename="%s"\r\n' %
                          (quote_plus(key),
                           quote_plus(info['filename'])))
            output.append('Content-Type: %s\r\n\r\n' % info['mimetype'])
            output.append(contents)
            output.append('\r\n')
    output.append('--%s--\n' % boundary)
    return (''.join(output), boundary)