1
# -*- encoding: utf-8 -*-
2
##############################################################################
4
# Copyright (C) 2009 EduSense BV (<http://www.edusense.nl>).
7
# This program is free software: you can redistribute it and/or modify
8
# it under the terms of the GNU General Public License as published by
9
# the Free Software Foundation, either version 3 of the License, or
10
# (at your option) any later version.
12
# This program is distributed in the hope that it will be useful,
13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
# GNU General Public License for more details.
17
# You should have received a copy of the GNU General Public License
18
# along with this program. If not, see <http://www.gnu.org/licenses/>.
20
##############################################################################
23
This module presents a browser-like class to browse the web, fill and submit
24
forms and to parse the results back in. It is heavily based on BeautifulSoup.
28
from BeautifulSoup import BeautifulSoup
30
__all__ = ['urlsplit', 'urljoin', 'pathbase', 'urlbase', 'SoupForm',
36
def urlsplit(url):
    '''
    Split an URL into scheme, host and path parts. Helper function.

    Returns a (scheme, host, path) tuple:
    - scheme is '' when the URL does not start with a valid scheme;
    - host is None when the URL has no '//host' part;
    - path is everything that follows the host, query string included.
    '''
    import re

    # Only accept the text before the first ':' as a scheme when it really
    # looks like one. Splitting on any colon would mistake a port number
    # ('//host:8080/x') or a colon in a query ('page?t=12:30') for it.
    found = re.match(r'([A-Za-z][A-Za-z0-9+.\-]*):', url)
    if found:
        scheme = found.group(1)
        url = url[found.end():]
    else:
        scheme = ''

    # Split '//host[:port]/rest' into host and rest; without a leading
    # '//' there is no host and the remainder is returned untouched.
    found = re.match(r'//([^/#?]*)(.*)', url, re.DOTALL)
    if not found:
        return (scheme, None, url)
    host, path = found.groups()
    if path and not path.startswith('/'):
        # e.g. '//host?x=1': keep the path absolute
        path = '/' + path
    return (scheme, host, path)
47
def urljoin(scheme, host, path, args=None):
    '''
    Join scheme, host and path to a full URL.
    Optional: add urlencoded args.

    scheme falls back to 'http' when empty. path may be given with or
    without its leading slash; an empty or None path yields the root URL.
    args, when not empty, is run through urllib.urlencode and appended as
    the query string.
    '''
    # The format below supplies the separating '/' itself, so drop one
    # leading slash to avoid building 'http://host//path'.
    path = path or ''
    if path.startswith('/'):
        path = path[1:]
    url = '%s://%s/%s' % (scheme or 'http', host, path)
    if args:
        url += '?%s' % urllib.urlencode(args)
    return url
60
def pathbase(path):
    '''
    Return the base for the path in order to satisfy relative paths.

    The base is everything up to and including the last '/' of the path
    proper. A query string or fragment is left out of the search, as a
    '/' in there is not a directory separator. A path without any '/'
    (or an empty path) is returned unchanged.
    '''
    if not path:
        return path
    # Cut off '?query' and '#fragment' before looking for the last slash.
    plain = path.split('?', 1)[0].split('#', 1)[0]
    if '/' in plain:
        return plain[:plain.rfind('/') + 1]
    return path
69
def urlbase(url):
    '''
    Return the base URL for url in order to satisfy relative paths.

    The URL is taken apart with urlsplit(), its path is reduced with
    pathbase() and the pieces are put back together with urljoin().
    '''
    proto, server, location = urlsplit(url)
    base = pathbase(location)
    return urljoin(proto, server, base)
75
class SoupForm(object):
    '''
    A SoupForm is a representation of a HTML Form in BeautifulSoup terms.
    It has a helper method __setitem__ to set or replace form fields.
    It gets initiated from a soup object.

    Every attribute of the form tag becomes an attribute of the instance
    (e.g. method, action). The values of the input elements are kept in
    _args. Values set afterwards, and values found in the query string of
    the action, are kept in _extra_args and take precedence.
    '''
    def __init__(self, soup, parent=False):
        '''
        Parse the form attributes and fields from the soup. Make sure
        to get the action right. When parent is set, then the parent
        element is used as anchor for the search for form elements.
        '''
        self._extra_args = {}
        self._args = {}
        self.soup = soup

        # A form tag may leave out both attributes; fall back to a GET on
        # the current location instead of failing on a missing attribute.
        self.method = 'get'
        self.action = ''

        # Make sure to use base strings, not unicode
        for attr, value in soup.attrMap.items():
            setattr(self, str(attr), str(value))

        # Set right anchor point for harvest
        if parent:
            self.soup = soup.parent

        # Harvest input elements.
        for item in self.soup.findAll('input'):
            name = item.get('name')
            if name is None:
                # Without a name there is nothing to submit; do not
                # register the field under the literal key 'None'.
                continue
            # Make sure to initialize to '' to avoid None strings to appear
            self._args[str(name)] = item.get('value') or ''

        # Harvest url: split off the query string and keep its arguments
        self.scheme, self.host, self.action = urlsplit(self.action)
        self.action, args = urllib.splitquery(self.action)
        if args:
            for arg in args.split('&'):
                if not arg:
                    continue
                attr, value = urllib.splitvalue(arg)
                # Store decoded text: all values are urlencoded again on
                # submit, which would double-encode an escaped value.
                attr = urllib.unquote_plus(str(attr))
                self._extra_args[attr] = urllib.unquote_plus(value or '')

    def __setitem__(self, name, value, force=False):
        '''
        Set values for the form attributes when present.

        Raises AttributeError when name is not a known field, unless
        force is set.
        '''
        # Anything __getitem__ can return may be replaced as well.
        known = name in self._args or name in self._extra_args
        if known or force:
            self._extra_args[name] = value
        else:
            raise AttributeError('No such attribute: %s' % name)

    def __getitem__(self, name):
        '''
        Get a value. Set values overrule got values.

        Raises AttributeError when name is not a known field.
        '''
        if name in self._extra_args:
            return self._extra_args[name]
        if name in self._args:
            return self._args[name]
        raise AttributeError('No attribute with name "%s" found.' % name)

    def set(self, **kwargs):
        '''
        Forcibly sets an attribute to the supplied value, even if it is not
        part of the parsed form.
        Can be useful in situations where forms are deliberately chunked in
        order to make it difficult to automate form requests, e.g. the
        SWIFT BIC service, which uses JavaScript to add form attributes.
        '''
        for name, value in kwargs.items():
            self.__setitem__(name, value, force=True)

    def args(self):
        '''
        Return the field values as a new dict: the harvested values,
        updated with the modified ones.
        '''
        args = dict(self._args)
        args.update(self._extra_args)
        return args
154
class URLAgent(object):
    '''
    Assistant object to ease HTTP(S) requests.
    Mimics a normal web browser.
    '''

    def __init__(self, *args, **kwargs):
        '''
        Set up the default request headers and an empty set of extra
        headers; see add_headers() for the latter.
        '''
        super(URLAgent, self).__init__(*args, **kwargs)
        self._extra_headers = {}
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; us; rv:1.9.0.10) Gecko/2009042708 Fedora/3.0.10-1.fc9 Firefox/3.0.10',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us;q=1.0',
            'Accept-Charset': 'UTF-8,*',
            'Cache-Control': 'max-age=0'
        }

    def add_headers(self, **kwargs):
        '''
        Register extra headers to send along with the next open().
        '''
        self._extra_headers.update(**kwargs)

    def open(self, URL):
        '''
        Open a URL and set some vars based on the used URL.
        Meant to be used on a single server.

        Returns the response object of the underlying urllib opener.
        '''
        self.scheme, self.host, self.path = urlsplit(URL)

        # Create agent
        self.agent = urllib.URLopener()

        # Remove additional and unasked for User-Agent header
        # Some servers choke on multiple User-Agent headers
        self.agent.addheaders = []
        # Explicitly added headers take precedence over the defaults.
        headers = dict(self.headers)
        headers.update(self._extra_headers)
        for key, value in headers.items():
            self.agent.addheader(key, value)

        # Open webpage
        request = self.agent.open(URL)

        # Get and set cookies for next actions
        # NOTE(review): the Set-Cookie value is echoed back as received,
        # cookie attributes included - confirm the servers in use accept it.
        attributes = request.info()
        if 'set-cookie' in attributes:
            self.agent.addheader('Cookie', attributes['set-cookie'])

        # Add referer
        self.agent.addheader('Referer', URL)

        return request

    def submit(self, form, action=None, method=None, **kwargs):
        '''
        Submit a SoupForm. Override missing attributes in action from our own
        initial URL.

        action and method default to those of the form. When keyword
        arguments are given they are sent instead of the form's fields.
        Requires a preceding open(), as that creates the agent and records
        the scheme, host and path that are used as fallback.
        '''
        if action:
            # Continue with the path part only; scheme and host are passed
            # to the opener separately.
            scheme, host, action = urlsplit(action)
            if host and not action:
                action = '/'
            scheme = scheme or self.scheme
            host = host or self.host
        else:
            scheme = form.scheme or self.scheme
            host = form.host or self.host
            action = form.action
        method = (method or form.method).lower()
        args = urllib.urlencode(kwargs or form.args())

        if not action.startswith('/'):
            # Relative path
            action = pathbase(self.path) + action

        function = getattr(self.agent, 'open_%s' % scheme)
        if method == 'post':
            return function('//%s%s' % (host, action), args)
        # An explicit action may already carry a query string
        separator = '&' if '?' in action else '?'
        return function('//%s%s%s%s' % (host, action, separator, args))