1
"""HTML form handling for web clients.
3
ClientForm is a Python module for handling HTML forms on the client
4
side, useful for parsing HTML forms, filling them in and returning the
5
completed forms to the server. It has developed from a port of Gisle
6
Aas' Perl module HTML::Form, from the libwww-perl library, but the
7
interface is not the same.
9
The most useful docstring is the one for HTMLForm.
12
RFC 1867: Form-based File Upload in HTML
13
RFC 2388: Returning Values from Forms: multipart/form-data
14
HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
15
HTML 4.01 Specification, W3C Recommendation 24 December 1999
18
Copyright 2002-2006 John J. Lee <jjl@pobox.com>
19
Copyright 2005 Gary Poster
20
Copyright 2005 Zope Corporation
21
Copyright 1998-2000 Gisle Aas.
23
This code is free software; you can redistribute it and/or modify it
24
under the terms of the BSD License (see the file COPYING included with
30
# Remove unescape_attr method
31
# Remove parser testing hack
32
# safeUrl()-ize action
33
# Really should merge CC, CF, pp and mechanize as soon as mechanize
35
# Add url attribute to ParseError
36
# Switch to unicode throughout (would be 0.3.x)
37
# See Wichert Akkerman's 2004-01-22 message to c.l.py.
38
# Add charset parameter to Content-type headers? How to find value??
39
# Add some more functional tests
40
# Especially single and multiple file upload on the internet.
41
# Does file upload work when name is missing? Sourceforge tracker form
42
# doesn't like it. Check standards, and test with Apache. Test
43
# binary upload with Apache.
44
# Controls can have name=None (e.g. forms constructed partly with
45
# JavaScript), but find_control can't be told to find a control
46
# with that name, because None there means 'unspecified'. Can still
47
# get at by nr, but would be nice to be able to specify something
48
# equivalent to name=None, too.
49
# mailto submission & enctype text/plain
50
# I'm not going to fix this unless somebody tells me what real servers
51
# that want this encoding actually expect: If enctype is
52
# application/x-www-form-urlencoded and there's a FILE control present.
53
# Strictly, it should be 'name=data' (see HTML 4.01 spec., section
54
# 17.13.2), but I send "name=" ATM. What about multiple file upload??
56
# Would be nice, but I'm not going to do it myself:
57
# -------------------------------------------------
59
# Replace by_label etc. with moniker / selector concept. Allows, eg.,
60
# a choice between selection by value / id / label / element
61
# contents. Or choice between matching labels exactly or by
63
# Remove deprecated methods.
66
# XForms? Don't know if there's a need here.
83
def debug(msg, *args, **kwds):
86
_logger = logging.getLogger("ClientForm")
87
OPTIMIZATION_HACK = True
89
def debug(msg, *args, **kwds):
97
sys.exc_info()[2].tb_frame.f_back.f_back.f_code.co_name)
98
extended_msg = '%%s %s' % msg
99
extended_args = (caller_name,)+args
100
debug = _logger.debug(extended_msg, *extended_args, **kwds)
102
def _show_debug_messages():
    """Switch on DEBUG-level logging for this module, echoed to stdout.

    Sets OPTIMIZATION_HACK to False, lowers the module logger's threshold
    to DEBUG and makes sure a DEBUG-level StreamHandler writing to
    sys.stdout is attached.  Calling this more than once is safe: an
    existing stdout handler is reused instead of a duplicate being added,
    so each message is still printed only once.
    """
    global OPTIMIZATION_HACK
    OPTIMIZATION_HACK = False
    _logger.setLevel(logging.DEBUG)
    for existing in _logger.handlers:
        if (isinstance(existing, logging.StreamHandler) and
                getattr(existing, "stream", None) is sys.stdout):
            # an earlier call already attached the stdout handler
            existing.setLevel(logging.DEBUG)
            return
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.DEBUG)
    _logger.addHandler(handler)
110
import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
111
htmlentitydefs, re, random
112
from urlparse import urljoin
113
from cStringIO import StringIO
118
def deprecation(message):
121
def deprecation(message):
    """Issue *message* as a DeprecationWarning.

    stacklevel=2 attributes the warning to the function that called
    deprecation() rather than to this helper itself.
    """
    warnings.warn(message, DeprecationWarning, stacklevel=2)
126
CHUNK = 1024 # size of chunks fed to parser, in bytes
128
# Default for the `encoding` parameter of the parser classes and unescape().
DEFAULT_ENCODING = "latin-1"
130
# Matches any run of one or more whitespace characters.
_compress_re = re.compile(r"\s+")


def compress_text(text):
    """Return *text* stripped, with each whitespace run collapsed to a space."""
    trimmed = text.strip()
    return _compress_re.sub(" ", trimmed)
133
# This version of urlencode is from my Python 1.5.2 back-port of the
134
# Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
135
# of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
136
def urlencode(query,doseq=False,):
137
"""Encode a sequence of two-element tuples or dictionary into a URL query \
140
If any values in the query arg are sequences and doseq is true, each
141
sequence element is converted to a separate parameter.
143
If the query arg is a sequence of two-element tuples, the order of the
144
parameters in the output will match the order of parameters in the
148
if hasattr(query,"items"):
150
query = query.items()
152
# it's a bother at times that strings and string-like objects are
155
# non-sequence items should not work with len()
157
# non-empty strings will fail this
158
if len(query) and type(query[0]) != types.TupleType:
160
# zero-length sequences of all types will get here and succeed,
161
# but that's a minor nit - since the original implementation
162
# allowed empty dicts that type of behavior probably should be
163
# preserved for consistency
165
ty,va,tb = sys.exc_info()
166
raise TypeError("not a valid non-string sequence or mapping "
171
# preserve old behavior
173
k = urllib.quote_plus(str(k))
174
v = urllib.quote_plus(str(v))
175
l.append(k + '=' + v)
178
k = urllib.quote_plus(str(k))
179
if type(v) == types.StringType:
180
v = urllib.quote_plus(v)
181
l.append(k + '=' + v)
182
elif type(v) == types.UnicodeType:
183
# is there a reasonable way to convert to ASCII?
184
# encode generates a string, but "replace" or "ignore"
185
# lose information and "strict" can raise UnicodeError
186
v = urllib.quote_plus(v.encode("ASCII","replace"))
187
l.append(k + '=' + v)
190
# is this a sufficient test for sequence-ness?
194
v = urllib.quote_plus(str(v))
195
l.append(k + '=' + v)
197
# loop over the sequence
199
l.append(k + '=' + urllib.quote_plus(str(elt)))
202
def unescape(data, entities, encoding=DEFAULT_ENCODING):
203
if data is None or "&" not in data:
206
def replace_entities(match, entities=entities, encoding=encoding):
209
return unescape_charref(ent[2:-1], encoding)
211
repl = entities.get(ent)
213
if type(repl) != type(""):
215
repl = repl.encode(encoding)
223
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
225
def unescape_charref(data, encoding):
226
name, base = data, 10
227
if name.startswith("x"):
228
name, base= name[1:], 16
229
uc = unichr(int(name, base))
234
repl = uc.encode(encoding)
236
repl = "&#%s;" % data
239
def get_entitydefs():
240
import htmlentitydefs
241
from codecs import latin_1_decode
244
htmlentitydefs.name2codepoint
245
except AttributeError:
247
for name, char in htmlentitydefs.entitydefs.items():
248
uc = latin_1_decode(char)[0]
249
if uc.startswith("&#") and uc.endswith(";"):
250
uc = unescape_charref(uc[2:-1], None)
251
entitydefs["&%s;" % name] = uc
253
for name, codepoint in htmlentitydefs.name2codepoint.items():
254
entitydefs["&%s;" % name] = unichr(codepoint)
261
except (TypeError, KeyError):
273
def choose_boundary():
    """Return a string usable as a multipart boundary.

    The result is 27 hyphens followed by three random non-negative decimal
    integers run together.
    """
    try:
        upper = sys.maxint - 1
    except AttributeError:
        # interpreters without sys.maxint (Python 3) provide sys.maxsize
        upper = sys.maxsize - 1
    # follow IE and firefox
    nonce = "".join([str(random.randint(0, upper)) for _ in range(3)])
    return "-" * 27 + nonce
279
# This cut-n-pasted MimeWriter from standard library is here so can add
280
# to HTTP headers rather than message body when appropriate. It also uses
281
# \r\n in place of \n. This is a bit nasty.
284
"""Generic MIME writer.
296
A MIME writer is much more primitive than a MIME parser. It
297
doesn't seek around on the output file, and it doesn't use large
298
amounts of buffer space, so you have to write the parts in the
299
order they should occur on the output file. It does buffer the
300
headers you add, allowing you to rearrange their order.
304
f = <open the output file>
306
...call w.addheader(key, value) 0 or more times...
310
f = w.startbody(content_type)
311
...call f.write(data) for body data...
315
w.startmultipartbody(subtype)
317
subwriter = w.nextpart()
318
...use the subwriter's methods to create the subpart...
321
The subwriter is another MimeWriter instance, and should be
322
treated in the same way as the toplevel MimeWriter. This way,
323
writing recursive body parts is easy.
325
Warning: don't forget to call lastpart()!
327
XXX There should be more state so calls made in the wrong order
332
- startbody() just returns the file passed to the constructor;
333
but don't use this knowledge, as it may be changed.
335
- startmultipartbody() actually returns a file as well;
336
this can be used to write the initial 'if you can read this your
337
mailer is not MIME-aware' message.
339
- If you call flushheaders(), the headers accumulated so far are
340
written out (and forgotten); this is useful if you don't need a
341
body part at all, e.g. for a subpart of type message/rfc822
342
that's (mis)used to store some header-like information.
344
- Passing a keyword argument 'prefix=<flag>' to addheader(),
345
start*body() affects where the header is inserted; 0 means
346
append at the end, 1 means insert at the start; default is
347
append for addheader(), but insert for start*body(), which use
348
it to determine where the Content-type header goes.
352
def __init__(self, fp, http_hdrs=None):
353
self._http_hdrs = http_hdrs
357
self._first_part = True
359
def addheader(self, key, value, prefix=0,
362
prefix is ignored if add_to_http_hdrs is true.
364
lines = value.split("\r\n")
365
while lines and not lines[-1]: del lines[-1]
366
while lines and not lines[0]: del lines[0]
368
value = "".join(lines)
369
self._http_hdrs.append((key, value))
371
for i in range(1, len(lines)):
372
lines[i] = " " + lines[i].strip()
373
value = "\r\n".join(lines) + "\r\n"
374
line = key + ": " + value
376
self._headers.insert(0, line)
378
self._headers.append(line)
380
def flushheaders(self):
381
self._fp.writelines(self._headers)
384
def startbody(self, ctype=None, plist=[], prefix=1,
385
add_to_http_hdrs=0, content_type=1):
387
prefix is ignored if add_to_http_hdrs is true.
389
if content_type and ctype:
390
for name, value in plist:
391
ctype = ctype + ';\r\n %s=%s' % (name, value)
392
self.addheader("Content-type", ctype, prefix=prefix,
393
add_to_http_hdrs=add_to_http_hdrs)
395
if not add_to_http_hdrs: self._fp.write("\r\n")
396
self._first_part = True
399
def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1,
400
add_to_http_hdrs=0, content_type=1):
401
boundary = boundary or choose_boundary()
402
self._boundary.append(boundary)
403
return self.startbody("multipart/" + subtype,
404
[("boundary", boundary)] + plist,
406
add_to_http_hdrs=add_to_http_hdrs,
407
content_type=content_type)
410
boundary = self._boundary[-1]
412
self._first_part = False
414
self._fp.write("\r\n")
415
self._fp.write("--" + boundary + "\r\n")
416
return self.__class__(self._fp)
421
boundary = self._boundary.pop()
422
self._fp.write("\r\n--" + boundary + "--\r\n")
425
class LocateError(ValueError):
    """Base class shared by the three lookup errors that follow."""


class AmbiguityError(LocateError):
    """Raised when a lookup (e.g. a label search) has ambiguous results."""


class ControlNotFoundError(LocateError):
    """Lookup error for controls.

    NOTE(review): presumably raised when no control matches -- the raise
    sites are not visible here, confirm against them.
    """


class ItemNotFoundError(LocateError):
    """Lookup error for list items.

    NOTE(review): presumably raised when no item matches -- the raise
    sites are not visible here, confirm against them.
    """
430
class ItemCountError(ValueError):
    """ValueError subclass concerning item counts.

    NOTE(review): raise sites are outside this view; presumably signals a
    wrong number of items -- confirm.
    """
433
class ParseError(Exception):
    """Raised by the form parsers on HTML parse errors (e.g. nested FORMs)."""
436
class _AbstractFormParser:
437
"""forms attribute contains HTMLForm instances on completion."""
438
# thanks to Moshe Zadka for an example of sgmllib/htmllib usage
439
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
440
if entitydefs is None:
441
entitydefs = get_entitydefs()
442
self._entitydefs = entitydefs
443
self._encoding = encoding
448
self._current_label = None
449
self._current_form = None
451
self._optgroup = None
453
self._textarea = None
455
def do_base(self, attrs):
457
for key, value in attrs:
463
if self._current_label is not None:
465
if self._current_form is not None:
468
def start_form(self, attrs):
470
if self._current_form is not None:
471
raise ParseError("nested FORMs")
474
enctype = "application/x-www-form-urlencoded"
477
for key, value in attrs:
480
elif key == "action":
482
elif key == "method":
483
method = value.upper()
484
elif key == "enctype":
485
enctype = value.lower()
488
self._current_form = (name, action, method, enctype), d, controls
492
if self._current_label is not None:
494
if self._current_form is None:
495
raise ParseError("end of FORM before start")
496
self.forms.append(self._current_form)
497
self._current_form = None
499
def start_select(self, attrs):
501
if self._current_form is None:
502
raise ParseError("start of SELECT before start of FORM")
503
if self._select is not None:
504
raise ParseError("nested SELECTs")
505
if self._textarea is not None:
506
raise ParseError("SELECT inside TEXTAREA")
508
for key, val in attrs:
514
self._append_select_control({"__select": d})
516
def end_select(self):
518
if self._current_form is None:
519
raise ParseError("end of SELECT before start of FORM")
520
if self._select is None:
521
raise ParseError("end of SELECT before start")
523
if self._option is not None:
528
def start_optgroup(self, attrs):
530
if self._select is None:
531
raise ParseError("OPTGROUP outside of SELECT")
533
for key, val in attrs:
538
def end_optgroup(self):
540
if self._optgroup is None:
541
raise ParseError("end of OPTGROUP before start")
542
self._optgroup = None
544
def _start_option(self, attrs):
546
if self._select is None:
547
raise ParseError("OPTION outside of SELECT")
548
if self._option is not None:
552
for key, val in attrs:
556
self._option.update(d)
557
if (self._optgroup and self._optgroup.has_key("disabled") and
558
not self._option.has_key("disabled")):
559
self._option["disabled"] = None
561
def _end_option(self):
563
if self._option is None:
564
raise ParseError("end of OPTION before start")
566
contents = self._option.get("contents", "").strip()
567
self._option["contents"] = contents
568
if not self._option.has_key("value"):
569
self._option["value"] = contents
570
if not self._option.has_key("label"):
571
self._option["label"] = contents
572
# stuff dict of SELECT HTML attrs into a special private key
573
# (gets deleted again later)
574
self._option["__select"] = self._select
575
self._append_select_control(self._option)
578
def _append_select_control(self, attrs):
580
controls = self._current_form[2]
581
name = self._select.get("name")
582
controls.append(("select", name, attrs))
584
def start_textarea(self, attrs):
586
if self._current_form is None:
587
raise ParseError("start of TEXTAREA before start of FORM")
588
if self._textarea is not None:
589
raise ParseError("nested TEXTAREAs")
590
if self._select is not None:
591
raise ParseError("TEXTAREA inside SELECT")
593
for key, val in attrs:
599
def end_textarea(self):
601
if self._current_form is None:
602
raise ParseError("end of TEXTAREA before start of FORM")
603
if self._textarea is None:
604
raise ParseError("end of TEXTAREA before start")
605
controls = self._current_form[2]
606
name = self._textarea.get("name")
607
controls.append(("textarea", name, self._textarea))
608
self._textarea = None
610
def start_label(self, attrs):
612
if self._current_label:
615
for key, val in attrs:
617
taken = bool(d.get("for")) # empty id is invalid
621
self.labels.append(d)
622
self._current_label = d
626
label = self._current_label
628
# something is ugly in the HTML, but we're ignoring it
630
self._current_label = None
631
label["__text"] = label["__text"]
632
# if it is staying around, it is True in all cases
635
def _add_label(self, d):
637
if self._current_label is not None:
638
if self._current_label["__taken"]:
639
self.end_label() # be fuzzy
641
self._current_label["__taken"] = True
642
d["__label"] = self._current_label
644
def handle_data(self, data):
645
# according to http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1
646
# line break immediately after start tags or immediately before end
647
# tags must be ignored, but real browsers only ignore a line break
648
# after a start tag, so we'll do that.
649
if data[0:1] == '\n':
653
if self._option is not None:
654
# self._option is a dictionary of the OPTION element's HTML
655
# attributes, but it has two special keys, one of which is the
656
# special "contents" key, which contains text between OPTION tags (the
657
# other is the "__select" key: see the end_option method)
660
elif self._textarea is not None:
663
# not if within option or textarea
664
elif self._current_label is not None:
665
map = self._current_label
670
if not map.has_key(key):
673
map[key] = map[key] + data
675
def do_button(self, attrs):
677
if self._current_form is None:
678
raise ParseError("start of BUTTON before start of FORM")
680
d["type"] = "submit" # default
681
for key, val in attrs:
683
controls = self._current_form[2]
687
# we don't want to lose information, so use a type string that
688
# doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
689
# e.g. type for BUTTON/RESET is "resetbutton"
690
# (type for INPUT/RESET is "reset")
693
controls.append((type, name, d))
695
def do_input(self, attrs):
697
if self._current_form is None:
698
raise ParseError("start of INPUT before start of FORM")
700
d["type"] = "text" # default
701
for key, val in attrs:
703
controls = self._current_form[2]
708
controls.append((type, name, d))
710
def do_isindex(self, attrs):
712
if self._current_form is None:
713
raise ParseError("start of ISINDEX before start of FORM")
715
for key, val in attrs:
717
controls = self._current_form[2]
720
# isindex doesn't have type or name HTML attributes
721
controls.append(("isindex", None, d))
723
def handle_entityref(self, name):
    """Expand the entity reference ``&name;`` and pass it to handle_data.

    The expansion is done by unescape() using this parser's own entity
    table (self._entitydefs) and encoding (self._encoding).
    """
    reference = '&%s;' % name
    self.handle_data(
        unescape(reference, self._entitydefs, self._encoding))
728
def handle_charref(self, name):
    """Expand a numeric character reference and pass it to handle_data.

    name is handed to unescape_charref() together with this parser's
    encoding (self._encoding).
    """
    expanded = unescape_charref(name, self._encoding)
    self.handle_data(expanded)
732
def unescape_attr(self, name):
    """Return *name* run through unescape().

    Uses this parser's entity table (self._entitydefs) and encoding
    (self._encoding).
    """
    return unescape(name, self._entitydefs, self._encoding)
736
def unescape_attrs(self, attrs):
739
for key, val in attrs.items():
742
except AttributeError:
743
escaped_attrs[key] = self.unescape_attr(val)
745
# e.g. "__select" -- yuck!
746
escaped_attrs[key] = self.unescape_attrs(val)
749
def unknown_entityref(self, ref):
    """Pass the entity reference on to handle_data as literal '&ref;' text."""
    literal = "&%s;" % ref
    self.handle_data(literal)
750
def unknown_charref(self, ref):
    """Pass the character reference on to handle_data as literal '&#ref;' text."""
    literal = "&#%s;" % ref
    self.handle_data(literal)
753
# HTMLParser.HTMLParser is recent, so live without it if it's not available
754
# (also, htmllib.HTMLParser is much more tolerant of bad HTML)
758
class XHTMLCompatibleFormParser:
    """Stand-in that refuses to be instantiated.

    NOTE(review): presumably only bound when importing HTMLParser failed;
    the enclosing try/except is not visible here -- confirm.
    """
    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        # Same signature as the real parser, so the failure surfaces at
        # construction time with an explanatory message.
        raise ValueError("HTMLParser could not be imported")
762
class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
763
"""Good for XHTML, bad for tolerance of incorrect HTML."""
764
# thanks to Michael Howitz for this!
765
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
    """Initialise both base classes.

    HTMLParser.HTMLParser is initialised first, with no arguments;
    entitydefs and encoding are then forwarded unchanged to
    _AbstractFormParser.__init__.
    """
    HTMLParser.HTMLParser.__init__(self)
    _AbstractFormParser.__init__(self, entitydefs, encoding)
769
def start_option(self, attrs):
    """Handle an OPTION start tag by delegating to _start_option."""
    _AbstractFormParser._start_option(self, attrs)
772
def end_option(self):
    """Handle an OPTION end tag by delegating to _end_option."""
    _AbstractFormParser._end_option(self)
775
def handle_starttag(self, tag, attrs):
777
method = getattr(self, "start_" + tag)
778
except AttributeError:
780
method = getattr(self, "do_" + tag)
781
except AttributeError:
788
def handle_endtag(self, tag):
790
method = getattr(self, "end_" + tag)
791
except AttributeError:
796
def unescape(self, name):
    """Return *name* unescaped via self.unescape_attr."""
    # Use the entitydefs passed into constructor, not
    # HTMLParser.HTMLParser's entitydefs.
    return self.unescape_attr(name)
801
def unescape_attr_if_required(self, name):
    """Return *name* unchanged; no unescaping is needed for this parser."""
    return name  # HTMLParser.HTMLParser already did it
803
def unescape_attrs_if_required(self, attrs):
807
# monkeypatch to fix http://www.python.org/sf/803422 :-(
808
sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
809
class _AbstractSgmllibParser(_AbstractFormParser):
    """Form-parser behaviour shared by the sgmllib-style parsers.

    Unlike XHTMLCompatibleFormParser, attribute values are explicitly
    unescaped by the *_if_required hooks.
    """

    def do_option(self, attrs):
        """Handle an OPTION tag by delegating to _start_option."""
        _AbstractFormParser._start_option(self, attrs)

    def unescape_attr_if_required(self, name):
        """Return *name* unescaped via unescape_attr."""
        return self.unescape_attr(name)

    def unescape_attrs_if_required(self, attrs):
        """Return *attrs* unescaped via unescape_attrs."""
        return self.unescape_attrs(attrs)
818
class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
    """Good for tolerance of incorrect HTML, bad for XHTML."""

    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        """Initialise sgmllib.SGMLParser, then the form-parser state.

        entitydefs and encoding are forwarded unchanged to
        _AbstractFormParser.__init__.
        """
        sgmllib.SGMLParser.__init__(self)
        _AbstractFormParser.__init__(self, entitydefs, encoding)
825
if sys.version_info[:2] < (2, 2):
826
raise ImportError # BeautifulSoup uses generators
831
class _AbstractBSFormParser(_AbstractSgmllibParser):
833
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
834
_AbstractFormParser.__init__(self, entitydefs, encoding)
835
self.bs_base_class.__init__(self)
836
def handle_data(self, data):
837
_AbstractFormParser.handle_data(self, data)
838
self.bs_base_class.handle_data(self, data)
840
class RobustFormParser(_AbstractBSFormParser, BeautifulSoup.BeautifulSoup):
    """Tries to be highly tolerant of incorrect HTML."""
    # The BeautifulSoup class whose __init__ and handle_data
    # _AbstractBSFormParser invokes explicitly.
    bs_base_class = BeautifulSoup.BeautifulSoup
843
class NestingRobustFormParser(_AbstractBSFormParser,
                              BeautifulSoup.ICantBelieveItsBeautifulSoup):
    """Tries to be highly tolerant of incorrect HTML.

    Different from RobustFormParser in that it more often guesses nesting
    above missing end tags (see BeautifulSoup docs).

    """
    # The BeautifulSoup class whose __init__ and handle_data
    # _AbstractBSFormParser invokes explicitly.
    bs_base_class = BeautifulSoup.ICantBelieveItsBeautifulSoup
853
#FormParser = XHTMLCompatibleFormParser # testing hack
854
#FormParser = RobustFormParser # testing hack
856
def ParseResponse(response, select_default=False,
857
ignore_errors=False, # ignored!
858
form_parser_class=FormParser,
859
request_class=urllib2.Request,
861
backwards_compat=True,
862
encoding=DEFAULT_ENCODING,
864
"""Parse HTTP response and return a list of HTMLForm instances.
866
The return value of urllib2.urlopen can be conveniently passed to this
867
function as the response parameter.
869
ClientForm.ParseError is raised on parse errors.
871
response: file-like object (supporting read() method) with a method
872
geturl(), returning the URI of the HTTP response
873
select_default: for multiple-selection SELECT controls and RADIO controls,
874
pick the first item as the default if none are selected in the HTML
875
form_parser_class: class to instantiate and use to parse
876
request_class: class to return from .click() method (default is
878
entitydefs: mapping like {"&": "&", ...} containing HTML entity
879
definitions (a sensible default is used)
880
encoding: character encoding used for encoding numeric character references
881
when matching link text. ClientForm does not attempt to find the encoding
882
in a META HTTP-EQUIV attribute in the document itself (mechanize, for
883
example, does do that and will pass the correct value to ClientForm using
886
backwards_compat: boolean that determines whether the returned HTMLForm
887
objects are backwards-compatible with old code. If backwards_compat is
890
- ClientForm 0.1 code will continue to work as before.
892
- Label searches that do not specify a nr (number or count) will always
893
get the first match, even if other controls match. If
894
backwards_compat is False, label searches that have ambiguous results
895
will raise an AmbiguityError.
897
- Item label matching is done by strict string comparison rather than
900
- De-selecting individual list items is allowed even if the Item is
903
The backwards_compat argument will be deprecated in a future release.
905
Pass a true value for select_default if you want the behaviour specified by
906
RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
907
RADIO or multiple-selection SELECT control if none were selected in the
908
HTML. Most browsers (including Microsoft Internet Explorer (IE) and
909
Netscape Navigator) instead leave all items unselected in these cases. The
910
W3C HTML 4.0 standard leaves this behaviour undefined in the case of
911
multiple-selection SELECT controls, but insists that at least one RADIO
912
button should be checked at all times, in contradiction to browser
915
There is a choice of parsers. ClientForm.XHTMLCompatibleFormParser (uses
916
HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
917
sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
918
Note that HTMLParser is only available in Python 2.2 and later. You can
919
pass your own class in here as a hack to work around bad HTML, but at your
920
own risk: there is no well-defined interface.
923
return ParseFile(response, response.geturl(), select_default,
932
def ParseFile(file, base_uri, select_default=False,
933
ignore_errors=False, # ignored!
934
form_parser_class=FormParser,
935
request_class=urllib2.Request,
937
backwards_compat=True,
938
encoding=DEFAULT_ENCODING,
940
"""Parse HTML and return a list of HTMLForm instances.
942
ClientForm.ParseError is raised on parse errors.
944
file: file-like object (supporting read() method) containing HTML with zero
945
or more forms to be parsed
946
base_uri: the URI of the document (note that the base URI used to submit
947
the form will be that given in the BASE element if present, not that of
950
For the other arguments and further details, see ParseResponse.__doc__.
954
deprecation("operating in backwards-compatibility mode")
955
fp = form_parser_class(entitydefs, encoding)
957
data = file.read(CHUNK)
960
except ParseError, e:
961
e.base_uri = base_uri
963
if len(data) != CHUNK: break
964
if fp.base is not None:
965
# HTML BASE element takes precedence over document URI
967
labels = [] # Label(label) for label in fp.labels]
973
coll = id_to_labels.get(for_id)
975
id_to_labels[for_id] = [label]
979
for (name, action, method, enctype), attrs, controls in fp.forms:
983
action = urljoin(base_uri, action)
984
action = fp.unescape_attr_if_required(action)
985
name = fp.unescape_attr_if_required(name)
986
attrs = fp.unescape_attrs_if_required(attrs)
987
# would be nice to make HTMLForm class (form builder) pluggable
989
action, method, enctype, name, attrs, request_class,
990
forms, labels, id_to_labels, backwards_compat)
991
for ii in range(len(controls)):
992
type, name, attrs = controls[ii]
993
attrs = fp.unescape_attrs_if_required(attrs)
994
name = fp.unescape_attr_if_required(name)
995
# index=ii*10 allows ImageControl to return multiple ordered pairs
996
form.new_control(type, name, attrs, select_default=select_default,
1005
def __init__(self, attrs):
1006
self.id = attrs.get("for")
1007
self._text = attrs.get("__text").strip()
1008
self._ctext = compress_text(self._text)
1010
self._backwards_compat = False # maintained by HTMLForm
1012
def __getattr__(self, name):
1014
if self._backwards_compat:
1018
return getattr(Label, name)
1020
def __setattr__(self, name, value):
1022
# don't see any need for this, so make it read-only
1023
raise AttributeError("text attribute is read-only")
1024
self.__dict__[name] = value
1027
return "<Label(id=%r, text=%r)>" % (self.id, self.text)
1030
def _get_label(attrs):
1031
text = attrs.get("__label")
1032
if text is not None:
1038
"""An HTML form control.
1040
An HTMLForm contains a sequence of Controls. The Controls in an HTMLForm
1041
are accessed using the HTMLForm.find_control method or the
1042
HTMLForm.controls attribute.
1044
Control instances are usually constructed using the ParseFile /
1045
ParseResponse functions. If you use those functions, you can ignore the
1046
rest of this paragraph. A Control is only properly initialised after the
1047
fixup method has been called. In fact, this is only strictly necessary for
1048
ListControl instances. This is necessary because ListControls are built up
1049
from ListControls each containing only a single item, and their initial
1050
value(s) can only be known after the sequence is complete.
1052
The types and values that are acceptable for assignment to the value
1053
attribute are defined by subclasses.
1055
If the disabled attribute is true, this represents the state typically
1056
represented by browsers by 'greying out' a control. If the disabled
1057
attribute is true, the Control will raise AttributeError if an attempt is
1058
made to change its value. In addition, the control will not be considered
1059
'successful' as defined by the W3C HTML 4 standard -- ie. it will
1060
contribute no data to the return value of the HTMLForm.click* methods. To
1061
enable a control, set the disabled attribute to a false value.
1063
If the readonly attribute is true, the Control will raise AttributeError if
1064
an attempt is made to change its value. To make a control writable, set
1065
the readonly attribute to a false value.
1067
All controls have the disabled and readonly attributes, not only those that
1068
may have the HTML attributes of the same names.
1070
On assignment to the value attribute, the following exceptions are raised:
1071
TypeError, AttributeError (if the value attribute should not be assigned
1072
to, because the control is disabled, for example) and ValueError.
1074
If the name or value attributes are None, or the value is an empty list, or
1075
if the control is disabled, the control is not successful.
1079
type: string describing type of control (see the keys of the
1080
HTMLForm.type2class dictionary for the allowable values) (readonly)
1081
name: name of control (readonly)
1082
value: current value of control (subclasses may allow a single value, a
1083
sequence of values, or either)
1084
disabled: disabled state
1085
readonly: readonly state
1086
id: value of id HTML attribute
1089
def __init__(self, type, name, attrs, index=None):
1091
type: string describing type of control (see the keys of the
1092
HTMLForm.type2class dictionary for the allowable values)
1094
attrs: HTML attributes of control's HTML element
1097
raise NotImplementedError()
1099
def add_to_form(self, form):
1101
form.controls.append(self)
1106
def is_of_kind(self, kind):
    """Abstract; always raises NotImplementedError here.

    Subclasses such as TextControl and FileControl override this to compare
    *kind* against their own kind string.
    """
    raise NotImplementedError()
1110
raise NotImplementedError()
1112
# Abstract placeholder: always raises; ScalarControl, for one, overrides it.
def __getattr__(self, name): raise NotImplementedError()
1113
# Abstract placeholder: always raises; ScalarControl, for one, overrides it.
def __setattr__(self, name, value): raise NotImplementedError()
1116
"""Return list of (key, value) pairs suitable for passing to urlencode.
1118
return [(k, v) for (i, k, v) in self._totally_ordered_pairs()]
1120
def _totally_ordered_pairs(self):
1121
"""Return list of (key, value, index) tuples.
1123
Like pairs, but allows preserving correct ordering even where several
1124
controls are involved.
1127
raise NotImplementedError()
1129
def _write_mime_data(self, mw, name, value):
1130
"""Write data for a subitem of this control to a MimeWriter."""
1131
# called by HTMLForm
1133
mw2.addheader("Content-disposition",
1134
'form-data; name="%s"' % name, 1)
1135
f = mw2.startbody(prefix=0)
1139
raise NotImplementedError()
1141
def get_labels(self):
1142
"""Return all labels (Label instances) for this control.
1144
If the control was surrounded by a <label> tag, that will be the first
1145
label; all other labels, connected by 'for' and 'id', are in the order
1146
that they appear in the HTML.
1151
res.append(self._label)
1153
res.extend(self._form._id_to_labels.get(self.id, ()))
1157
#---------------------------------------------------
1158
class ScalarControl(Control):
1159
"""Control whose value is not restricted to one of a prescribed set.
1161
Some ScalarControls don't accept any value attribute. Otherwise, takes a
1162
single value, which must be string-like.
1164
Additional read-only public attribute:
1166
attrs: dictionary mapping the names of original HTML attributes of the
1167
control to their values
1170
def __init__(self, type, name, attrs, index=None):
1172
self._label = _get_label(attrs)
1173
self.__dict__["type"] = type.lower()
1174
self.__dict__["name"] = name
1175
self._value = attrs.get("value")
1176
self.disabled = attrs.has_key("disabled")
1177
self.readonly = attrs.has_key("readonly")
1178
self.id = attrs.get("id")
1180
self.attrs = attrs.copy()
1182
self._clicked = False
1184
def __getattr__(self, name):
1186
return self.__dict__["_value"]
1188
raise AttributeError("%s instance has no attribute '%s'" %
1189
(self.__class__.__name__, name))
1191
def __setattr__(self, name, value):
1193
if not isstringlike(value):
1194
raise TypeError("must assign a string")
1196
raise AttributeError("control '%s' is readonly" % self.name)
1198
raise AttributeError("control '%s' is disabled" % self.name)
1199
self.__dict__["_value"] = value
1200
elif name in ("name", "type"):
1201
raise AttributeError("%s attribute is readonly" % name)
1203
self.__dict__[name] = value
1205
def _totally_ordered_pairs(self):
1208
if name is None or value is None or self.disabled:
1210
return [(self._index, name, value)]
1214
raise AttributeError("control '%s' is readonly" % self.name)
1215
self.__dict__["_value"] = None
1220
if name is None: name = "<None>"
1221
if value is None: value = "<None>"
1224
if self.disabled: infos.append("disabled")
1225
if self.readonly: infos.append("readonly")
1226
info = ", ".join(infos)
1227
if info: info = " (%s)" % info
1229
return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
1232
#---------------------------------------------------
1233
class TextControl(ScalarControl):
1234
"""Textual input control.
1244
def __init__(self, type, name, attrs, index=None):
    """Initialise as a ScalarControl, then apply text-control defaults.

    Controls of type "hidden" are made readonly, and a missing value
    becomes the empty string.

    """
    ScalarControl.__init__(self, type, name, attrs, index)
    if self.type == "hidden":
        self.readonly = True
    if self._value is None:
        self._value = ""
1250
def is_of_kind(self, kind):
    """Return whether kind is "text", the only kind this control has."""
    return "text" == kind
1252
#---------------------------------------------------
1253
class FileControl(ScalarControl):
1254
"""File upload with INPUT TYPE=FILE.
1256
The value attribute of a FileControl is always None. Use add_file instead.
1258
Additional public method: add_file
1262
def __init__(self, type, name, attrs, index=None):
    """Initialise as a ScalarControl with no value and no queued uploads."""
    ScalarControl.__init__(self, type, name, attrs, index)
    # the value of a FileControl is always None; files go via add_file
    self._value = None
    self._upload_data = []
1267
def is_of_kind(self, kind):
    """Return whether kind is "file", the only kind this control has."""
    return "file" == kind
1271
raise AttributeError("control '%s' is readonly" % self.name)
1272
self._upload_data = []
1274
def __setattr__(self, name, value):
1275
if name in ("value", "name", "type"):
1276
raise AttributeError("%s attribute is readonly" % name)
1278
self.__dict__[name] = value
1280
def add_file(self, file_object, content_type=None, filename=None):
    """Queue a file-like object for upload with this control.

    file_object: must have a read method (TypeError otherwise)
    content_type: None or string-like; None means
     "application/octet-stream"
    filename: None or string-like

    """
    if not hasattr(file_object, "read"):
        raise TypeError("file-like object must have read method")
    checks = (
        (content_type, "content type must be None or string-like"),
        (filename, "filename must be None or string-like"),
        )
    for candidate, complaint in checks:
        if candidate is not None and not isstringlike(candidate):
            raise TypeError(complaint)
    if content_type is None:
        content_type = "application/octet-stream"
    entry = (file_object, content_type, filename)
    self._upload_data.append(entry)
1291
def _totally_ordered_pairs(self):
1292
# XXX should it be successful even if unnamed?
1293
if self.name is None or self.disabled:
1295
return [(self._index, self.name, "")]
1297
def _write_mime_data(self, mw, _name, _value):
1298
# called by HTMLForm
1299
# assert _name == self.name and _value == ''
1300
if len(self._upload_data) == 1:
1302
file_object, content_type, filename = self._upload_data[0]
1304
fn_part = filename and ('; filename="%s"' % filename) or ""
1305
disp = 'form-data; name="%s"%s' % (self.name, fn_part)
1306
mw2.addheader("Content-disposition", disp, prefix=1)
1307
fh = mw2.startbody(content_type, prefix=0)
1308
fh.write(file_object.read())
1309
elif len(self._upload_data) != 0:
1312
disp = 'form-data; name="%s"' % self.name
1313
mw2.addheader("Content-disposition", disp, prefix=1)
1314
fh = mw2.startmultipartbody("mixed", prefix=0)
1315
for file_object, content_type, filename in self._upload_data:
1316
mw3 = mw2.nextpart()
1317
fn_part = filename and ('; filename="%s"' % filename) or ""
1318
disp = "file%s" % fn_part
1319
mw3.addheader("Content-disposition", disp, prefix=1)
1320
fh2 = mw3.startbody(content_type, prefix=0)
1321
fh2.write(file_object.read())
1326
if name is None: name = "<None>"
1328
if not self._upload_data:
1329
value = "<No files added>"
1332
for file, ctype, filename in self._upload_data:
1333
if filename is None:
1334
value.append("<Unnamed file>")
1336
value.append(filename)
1337
value = ", ".join(value)
1340
if self.disabled: info.append("disabled")
1341
if self.readonly: info.append("readonly")
1342
info = ", ".join(info)
1343
if info: info = " (%s)" % info
1345
return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
1348
#---------------------------------------------------
1349
class IsindexControl(ScalarControl):
1352
ISINDEX is the odd-one-out of HTML form controls. In fact, it isn't really
1353
part of regular HTML forms at all, and predates it. You're only allowed
1354
one ISINDEX per HTML document. ISINDEX and regular form submission are
1355
mutually exclusive -- either submit a form, or the ISINDEX.
1357
Having said this, since ISINDEX controls may appear in forms (which is
1358
probably bad HTML), ParseFile / ParseResponse will include them in the
1359
HTMLForm instances it returns. You can set the ISINDEX's value, as with
1360
any other control (but note that ISINDEX controls have no name, so you'll
1361
need to use the type argument of set_value!). When you submit the form,
1362
the ISINDEX will not be successful (ie., no data will get returned to the
1363
server as a result of its presence), unless you click on the ISINDEX
1364
control, in which case the ISINDEX gets submitted instead of the form:
1366
form.set_value("my isindex value", type="isindex")
1367
urllib2.urlopen(form.click(type="isindex"))
1369
ISINDEX elements outside of FORMs are ignored. If you want to submit one
1370
by hand, do it like so:
1372
url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value"))
1373
result = urllib2.urlopen(url)
1376
def __init__(self, type, name, attrs, index=None):
    """Initialise as a ScalarControl; a missing value becomes ""."""
    ScalarControl.__init__(self, type, name, attrs, index)
    if self._value is None:
        self._value = ""
1381
def is_of_kind(self, kind):
    """Return whether kind is "text" or "clickable"."""
    accepted = ["text", "clickable"]
    return kind in accepted
1383
def _totally_ordered_pairs(self):
1386
def _click(self, form, coord, return_type, request_class=urllib2.Request):
1387
# Relative URL for ISINDEX submission: instead of "foo=bar+baz",
1389
# This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is
1390
# deprecated in 4.01, but it should still say how to submit it).
1391
# Submission of ISINDEX is explained in the HTML 3.2 spec, though.
1392
parts = urlparse.urlparse(form.action)
1393
rest, (query, frag) = parts[:-2], parts[-2:]
1394
parts = rest + (urllib.quote_plus(self.value), "")
1395
url = urlparse.urlunparse(parts)
1396
req_data = url, None, []
1398
if return_type == "pairs":
1400
elif return_type == "request_data":
1403
return request_class(url)
1407
if value is None: value = "<None>"
1410
if self.disabled: infos.append("disabled")
1411
if self.readonly: infos.append("readonly")
1412
info = ", ".join(infos)
1413
if info: info = " (%s)" % info
1415
return "<%s(%s)%s>" % (self.__class__.__name__, value, info)
1418
#---------------------------------------------------
1419
class IgnoreControl(ScalarControl):
1420
"""Control that we're not interested in.
1429
These controls are always unsuccessful, in the terminology of HTML 4 (ie.
1430
they never require any information to be returned to the server).
1432
BUTTON/BUTTON is used to generate events for script embedded in HTML.
1434
The value attribute of IgnoreControl is always None.
1437
def __init__(self, type, name, attrs, index=None):
    """Initialise as a ScalarControl, then discard any value."""
    ScalarControl.__init__(self, type, name, attrs, index)
    # the value of an IgnoreControl is always None
    self._value = None
1441
def is_of_kind(self, kind):
    """Return False: an ignored control is of no kind at all."""
    return False
1443
def __setattr__(self, name, value):
1445
raise AttributeError(
1446
"control '%s' is ignored, hence read-only" % self.name)
1447
elif name in ("name", "type"):
1448
raise AttributeError("%s attribute is readonly" % name)
1450
self.__dict__[name] = value
1453
#---------------------------------------------------
1456
# helpers and subsidiary classes
1459
def __init__(self, control, attrs, index=None):
    """Create a list item and append it to control.items.

    control: the list control that owns this item
    attrs: mapping of HTML attribute names to values; attrs["value"]
     becomes the item name
    index: stored as _index

    """
    label = _get_label(attrs)
    if label:
        labels = [label]
    else:
        labels = []
    # write through __dict__: __setattr__ rejects most attribute names
    self.__dict__.update({
        "name": attrs["value"],
        "_labels": labels,
        "attrs": attrs,
        "_control": control,
        "disabled": "disabled" in attrs,
        "_selected": False,
        "id": attrs.get("id"),
        "_index": index,
        })
    control.items.append(self)
1473
def get_labels(self):
    """Return all labels (Label instances) for this item.

    For items that represent radio buttons or checkboxes, if the item was
    surrounded by a <label> tag, that will be the first label; all other
    labels, connected by 'for' and 'id', are in the order that appear in
    the HTML.

    For items that represent select options, if the option had a label
    attribute, that will be the first label.  If the option has contents
    (text within the option tags) and it is not the same as the label
    attribute (if any), that will be a label.  There is nothing in the
    spec to my knowledge that makes an option with an id unable to be the
    target of a label's for attribute, so those are included, if any, for
    the sake of consistency and completeness.

    """
    res = []
    res.extend(self._labels)
    # items without an id cannot be the target of a label's 'for'
    if self.id:
        res.extend(self._control._form._id_to_labels.get(self.id, ()))
    return res
1496
def __getattr__(self, name):
1497
if name=="selected":
1498
return self._selected
1499
raise AttributeError(name)
1501
def __setattr__(self, name, value):
1502
if name == "selected":
1503
self._control._set_selected_state(self, value)
1504
elif name == "disabled":
1505
self.__dict__["disabled"] = bool(value)
1507
raise AttributeError(name)
1518
attrs = [("name", self.name), ("id", self.id)]+self.attrs.items()
1519
return "<%s %s>" % (
1520
self.__class__.__name__,
1521
" ".join(["%s=%r" % (k, v) for k, v in attrs])
1524
def disambiguate(items, nr, **kwds):
    """Return the single item selected from items by nr.

    items: sequence of candidate items
    nr: 0-based index into items, or None to require a unique match
    kwds: the search criteria; used only to build the error message

    Raises ItemNotFoundError if items is empty or nr is out of range, and
    AmbiguityError if nr is None and there is more than one item.

    """
    msgs = []
    for key, value in kwds.items():
        msgs.append("%s=%r" % (key, value))
    msg = " ".join(msgs)
    if not items:
        raise ItemNotFoundError(msg)
    if nr is None:
        if len(items) > 1:
            raise AmbiguityError(msg)
        nr = 0
    if len(items) <= nr:
        raise ItemNotFoundError(msg)
    return items[nr]
1539
class ListControl(Control):
1540
"""Control representing a sequence of items.
1542
The value attribute of a ListControl represents the successful list items
1543
in the control. The successful list items are those that are selected and
1546
ListControl implements both list controls that take a length-1 value
1547
(single-selection) and those that take length >1 values
1548
(multiple-selection).
1550
ListControls accept sequence values only. Some controls only accept
1551
sequences of length 0 or 1 (RADIO, and single-selection SELECT).
1552
In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes
1553
and multiple-selection SELECTs (those having the "multiple" HTML attribute)
1554
accept sequences of any length.
1556
Note the following mistake:
1558
control.value = some_value
1559
assert control.value == some_value # not necessarily true
1561
The reason for this is that the value attribute always gives the list items
1562
in the order they were listed in the HTML.
1564
ListControl items can also be referred to by their labels instead of names.
1565
Use the label argument to .get(), and the .set_value_by_label(),
1566
.get_value_by_label() methods.
1568
Note that, rather confusingly, though SELECT controls are represented in
1569
HTML by SELECT elements (which contain OPTION elements, representing
1570
individual list items), CHECKBOXes and RADIOs are not represented by *any*
1571
element. Instead, those controls are represented by a collection of INPUT
1572
elements. For example, this is a SELECT control, named "control1":
1574
<select name="control1">
1575
<option>foo</option>
1576
<option value="1">bar</option>
1579
and this is a CHECKBOX control, named "control2":
1581
<input type="checkbox" name="control2" value="foo" id="cbe1">
1582
<input type="checkbox" name="control2" value="bar" id="cbe2">
1584
The id attribute of a CHECKBOX or RADIO ListControl is always that of its
1585
first element (for example, "cbe1" above).
1588
Additional read-only public attribute: multiple.
1592
# ListControls are built up by the parser from their component items by
1593
# creating one ListControl per item, consolidating them into a single
1594
# master ListControl held by the HTMLForm:
1596
# -User calls form.new_control(...)
1597
# -Form creates Control, and calls control.add_to_form(self).
1598
# -Control looks for a Control with the same name and type in the form,
1599
# and if it finds one, merges itself with that control by calling
1600
# control.merge_control(self). The first Control added to the form, of
1601
# a particular name and type, is the only one that survives in the
1603
# -Form calls control.fixup for all its controls. ListControls in the
1604
# form know they can now safely pick their default values.
1606
# To create a ListControl without an HTMLForm, use:
1608
# control.merge_control(new_control)
1610
# (actually, it's much easier just to use ParseFile)
1614
def __init__(self, type, name, attrs={}, select_default=False,
1615
called_as_base_class=False, index=None):
1617
select_default: for RADIO and multiple-selection SELECT controls, pick
1618
the first item as the default if no 'selected' HTML attribute is
1622
if not called_as_base_class:
1623
raise NotImplementedError()
1625
self.__dict__["type"] = type.lower()
1626
self.__dict__["name"] = name
1627
self._value = attrs.get("value")
1628
self.disabled = False
1629
self.readonly = False
1630
self.id = attrs.get("id")
1632
# As Controls are merged in with .merge_control(), self.attrs will
1633
# refer to each Control in turn -- always the most recently merged
1634
# control. Each merged-in Control instance corresponds to a single
1635
# list item: see ListControl.__doc__.
1639
self._select_default = select_default
1640
self._clicked = False
1645
def is_of_kind(self, kind):
    """Return whether this control is of the given kind.

    "list" is always true; "multilist" and "singlelist" depend on the
    multiple attribute; every other kind is false.

    """
    if kind == "list":
        return True
    elif kind == "multilist":
        return bool(self.multiple)
    elif kind == "singlelist":
        return not self.multiple
    else:
        return False
1655
def get_items(self, name=None, label=None, id=None,
              exclude_disabled=False):
    """Return matching items by name or label.

    For argument docs, see the docstring for .get()

    Items are returned in the order they are held by the control.  Raises
    TypeError if name, label or id is given but is not string-like.

    """
    if name is not None and not isstringlike(name):
        raise TypeError("item name must be string-like")
    if label is not None and not isstringlike(label):
        raise TypeError("item label must be string-like")
    if id is not None and not isstringlike(id):
        raise TypeError("item id must be string-like")
    items = []  # order is important
    compat = self._form.backwards_compat
    for o in self.items:
        if exclude_disabled and o.disabled:
            continue
        if name is not None and o.name != name:
            continue
        if label is not None:
            # compat mode wants an exact label, otherwise a substring
            for l in o.get_labels():
                if ((compat and l.text == label) or
                    (not compat and l.text.find(label) > -1)):
                    break
            else:
                continue
        if id is not None and o.id != id:
            continue
        items.append(o)
    return items
1687
def get(self, name=None, label=None, id=None, nr=None,
        exclude_disabled=False):
    """Return item by name or label, disambiguating if necessary with nr.

    All arguments must be passed by name, with the exception of 'name',
    which may be used as a positional argument.

    If name is specified, then the item must have the indicated name.

    If label is specified, then the item must have a label whose
    whitespace-compressed, stripped, text substring-matches the indicated
    label string (eg. label="please choose" will match
    " Do please choose an item ").

    If id is specified, then the item must have the indicated id.

    nr is an optional 0-based index of the items matching the query.

    If nr is the default None value and more than one item is found, raises
    AmbiguityError (unless the HTMLForm instance's backwards_compat
    attribute is true, in which case the first match is returned).

    If no item is found, or if items are found but nr is specified and not
    found, raises ItemNotFoundError.

    Optionally excludes disabled items.

    """
    if nr is None and self._form.backwards_compat:
        nr = 0  # compat mode: silently take the first match
    items = self.get_items(name, label, id, exclude_disabled)
    return disambiguate(items, nr, name=name, label=label, id=id)
1720
def _get(self, name, by_label=False, nr=None, exclude_disabled=False):
1721
# strictly for use by deprecated methods
1723
name, label = None, name
1725
name, label = name, None
1726
return self.get(name, label, nr, exclude_disabled)
1728
def toggle(self, name, by_label=False, nr=None):
1729
"""Deprecated: given a name or label and optional disambiguating index
1730
nr, toggle the matching item's selection.
1732
Selecting items follows the behavior described in the docstring of the
1735
if the item is disabled, or this control is disabled or readonly,
1736
raise AttributeError.
1740
"item = control.get(...); item.selected = not item.selected")
1741
o = self._get(name, by_label, nr)
1742
self._set_selected_state(o, not o.selected)
1744
def set(self, selected, name, by_label=False, nr=None):
1745
"""Deprecated: given a name or label and optional disambiguating index
1746
nr, set the matching item's selection to the bool value of selected.
1748
Selecting items follows the behavior described in the docstring of the
1751
if the item is disabled, or this control is disabled or readonly,
1752
raise AttributeError.
1756
"control.get(...).selected = <boolean>")
1757
self._set_selected_state(self._get(name, by_label, nr), selected)
1759
def _set_selected_state(self, item, action):
1764
raise AttributeError("control '%s' is disabled" % self.name)
1766
raise AttributeError("control '%s' is readonly" % self.name)
1767
action == bool(action)
1768
compat = self._form.backwards_compat
1769
if not compat and item.disabled:
1770
raise AttributeError("item is disabled")
1772
if compat and item.disabled and action:
1773
raise AttributeError("item is disabled")
1775
item.__dict__["_selected"] = action
1778
item.__dict__["_selected"] = False
1780
for o in self.items:
1781
o.__dict__["_selected"] = False
1782
item.__dict__["_selected"] = True
1784
def toggle_single(self, by_label=None):
1785
"""Deprecated: toggle the selection of the single item in this control.
1787
Raises ItemCountError if the control does not contain only one item.
1789
by_label argument is ignored, and included only for backwards
1794
"control.items[0].selected = not control.items[0].selected")
1795
if len(self.items) != 1:
1796
raise ItemCountError(
1797
"'%s' is not a single-item control" % self.name)
1798
item = self.items[0]
1799
self._set_selected_state(item, not item.selected)
1801
def set_single(self, selected, by_label=None):
1802
"""Deprecated: set the selection of the single item in this control.
1804
Raises ItemCountError if the control does not contain only one item.
1806
by_label argument is ignored, and included only for backwards
1811
"control.items[0].selected = <boolean>")
1812
if len(self.items) != 1:
1813
raise ItemCountError(
1814
"'%s' is not a single-item control" % self.name)
1815
self._set_selected_state(self.items[0], selected)
1817
def get_item_disabled(self, name, by_label=False, nr=None):
1818
"""Get disabled state of named list item in a ListControl."""
1820
"control.get(...).disabled")
1821
return self._get(name, by_label, nr).disabled
1823
def set_item_disabled(self, disabled, name, by_label=False, nr=None):
1824
"""Set disabled state of named list item in a ListControl.
1826
disabled: boolean disabled state
1830
"control.get(...).disabled = <boolean>")
1831
self._get(name, by_label, nr).disabled = disabled
1833
def set_all_items_disabled(self, disabled):
    """Set disabled state of all list items in a ListControl.

    disabled: boolean disabled state

    """
    for item in self.items:
        item.disabled = disabled
1842
def get_item_attrs(self, name, by_label=False, nr=None):
1843
"""Return dictionary of HTML attributes for a single ListControl item.
1845
The HTML element types that describe list items are: OPTION for SELECT
1846
controls, INPUT for the rest. These elements have HTML attributes that
1847
you may occasionally want to know about -- for example, the "alt" HTML
1848
attribute gives a text string describing the item (graphical browsers
1849
usually display this as a tooltip).
1851
The returned dictionary maps HTML attribute names to values. The names
1852
and values are taken from the original HTML.
1856
"control.get(...).attrs")
1857
return self._get(name, by_label, nr).attrs
1859
def add_to_form(self, form):
1860
assert self._form is None or form == self._form, (
1861
"can't add control to more than one form")
1864
control = form.find_control(self.name, self.type)
1865
except ControlNotFoundError:
1866
Control.add_to_form(self, form)
1868
control.merge_control(self)
1870
def merge_control(self, control):
    """Append the items of control to this control's items.

    Both controls must agree on whether they are multiple-selection.

    """
    theirs = bool(control.multiple)
    mine = bool(self.multiple)
    assert theirs == mine
    # usually, isinstance(control, self.__class__)
    self.items.extend(control.items)
1877
ListControls are built up from component list items (which are also
1878
ListControls) during parsing. This method should be called after all
1879
items have been added. See ListControl.__doc__ for the reason this is
1883
# Need to set default selection where no item was indicated as being
1884
# selected by the HTML:
1887
# Nothing should be selected.
1888
# SELECT/single, SELECT/multiple and RADIO:
1889
# RFC 1866 (HTML 2.0): says first item should be selected.
1890
# W3C HTML 4.01 Specification: says that client behaviour is
1891
# undefined in this case. For RADIO, exactly one must be selected,
1892
# though which one is undefined.
1893
# Both Netscape and Microsoft Internet Explorer (IE) choose first
1894
# item for SELECT/single. However, both IE5 and Mozilla (both 1.0
1895
# and Firebird 0.6) leave all items unselected for RADIO and
1898
# Since both Netscape and IE all choose the first item for
1899
# SELECT/single, we do the same. OTOH, both Netscape and IE
1900
# leave SELECT/multiple with nothing selected, in violation of RFC 1866
1901
# (but not in violation of the W3C HTML 4 standard); the same is true
1902
# of RADIO (which *is* in violation of the HTML 4 standard). We follow
1903
# RFC 1866 if the _select_default attribute is set, and Netscape and IE
1904
# otherwise. RFC 1866 and HTML 4 are always violated insofar as you
1905
# can deselect all items in a RadioControl.
1907
for o in self.items:
1908
# set items' controls to self, now that we've merged
1909
o.__dict__["_control"] = self
1911
def __getattr__(self, name):
1913
compat = self._form.backwards_compat
1914
return [o.name for o in self.items if o.selected and
1915
(not o.disabled or compat)]
1917
raise AttributeError("%s instance has no attribute '%s'" %
1918
(self.__class__.__name__, name))
1920
def __setattr__(self, name, value):
1923
raise AttributeError("control '%s' is disabled" % self.name)
1925
raise AttributeError("control '%s' is readonly" % self.name)
1926
self._set_value(value)
1927
elif name in ("name", "type", "multiple"):
1928
raise AttributeError("%s attribute is readonly" % name)
1930
self.__dict__[name] = value
1932
def _set_value(self, value):
1933
if value is None or isstringlike(value):
1934
raise TypeError("ListControl, must set a sequence")
1936
compat = self._form.backwards_compat
1937
for o in self.items:
1938
if not o.disabled or compat:
1941
self._multiple_set_value(value)
1942
elif len(value) > 1:
1943
raise ItemCountError(
1944
"single selection list, must set sequence of "
1947
self._single_set_value(value)
1949
def _get_items(self, name, target=1):
1950
all_items = self.get_items(name)
1951
items = [o for o in all_items if not o.disabled]
1952
if len(items) < target:
1953
if len(all_items) < target:
1954
raise ItemNotFoundError(
1955
"insufficient items with name %r" % name)
1957
raise AttributeError(
1958
"insufficient non-disabled items with name %s" % name)
1968
def _single_set_value(self, value):
1969
assert len(value) == 1
1970
on, off = self._get_items(value[0])
1973
off[0].selected = True
1975
def _multiple_set_value(self, value):
1976
compat = self._form.backwards_compat
1977
turn_on = [] # transactional-ish
1978
turn_off = [item for item in self.items if
1979
item.selected and (not item.disabled or compat)]
1982
if nn in names.keys():
1986
for name, count in names.items():
1987
on, off = self._get_items(name, count)
1988
for i in range(count):
1992
del turn_off[turn_off.index(item)]
1996
turn_on.append(item)
1997
for item in turn_off:
1998
item.selected = False
1999
for item in turn_on:
2000
item.selected = True
2002
def set_value_by_label(self, value):
2003
"""Set the value of control by item labels.
2005
value is expected to be an iterable of strings that are substrings of
2006
the item labels that should be selected. Before substring matching is
2007
performed, the original label text is whitespace-compressed
2008
(consecutive whitespace characters are converted to a single space
2009
character) and leading and trailing whitespace is stripped. Ambiguous
2010
labels are accepted without complaint if the form's backwards_compat is
2011
True; otherwise, it will not complain as long as all ambiguous labels
2012
share the same item name (e.g. OPTION value).
2015
if isstringlike(value):
2016
raise TypeError(value)
2017
if not self.multiple and len(value) > 1:
2018
raise ItemCountError(
2019
"single selection list, must set sequence of "
2023
found = self.get_items(label=nn)
2025
if not self._form.backwards_compat:
2026
# ambiguous labels are fine as long as item names (e.g.
2027
# OPTION values) are same
2028
opt_name = found[0].name
2029
if [o for o in found[1:] if o.name != opt_name]:
2030
raise AmbiguityError(nn)
2032
# OK, we'll guess :-( Assume first available item.
2035
# For the multiple-item case, we could try to be smarter,
2036
# saving them up and trying to resolve, but that's too much.
2037
if self._form.backwards_compat or o not in items:
2040
else: # all of them are used
2041
raise ItemNotFoundError(nn)
2042
# now we have all the items that should be on
2043
# let's just turn everything off and then back on.
2048
def get_value_by_label(self):
2049
"""Return the value of the control as given by normalized labels."""
2051
compat = self._form.backwards_compat
2052
for o in self.items:
2053
if (not o.disabled or compat) and o.selected:
2054
for l in o.get_labels():
2062
def possible_items(self, by_label=False):
2063
"""Deprecated: return the names or labels of all possible items.
2065
Includes disabled items, which may be misleading for some use cases.
2069
"[item.name for item in self.items]")
2072
for o in self.items:
2073
for l in o.get_labels():
2080
return [o.name for o in self.items]
2082
def _totally_ordered_pairs(self):
2086
return [(o._index, self.name, o.name) for o in self.items
2087
if o.selected and not o.disabled]
2091
if name is None: name = "<None>"
2093
display = [str(o) for o in self.items]
2096
if self.disabled: infos.append("disabled")
2097
if self.readonly: infos.append("readonly")
2098
info = ", ".join(infos)
2099
if info: info = " (%s)" % info
2101
return "<%s(%s=[%s])%s>" % (self.__class__.__name__,
2102
name, ", ".join(display), info)
2105
class RadioControl(ListControl):
2112
def __init__(self, type, name, attrs, select_default=False, index=None):
    """Build a single-selection control holding one radio button item.

    attrs: HTML attributes of the INPUT element; a missing "value" is
     filled in as "on" in the mapping passed in
    The item starts selected if attrs has a "checked" key.

    """
    attrs.setdefault("value", "on")
    ListControl.__init__(self, type, name, attrs, select_default,
                         called_as_base_class=True, index=index)
    self.__dict__["multiple"] = False
    o = Item(self, attrs, index)
    o.__dict__["_selected"] = "checked" in attrs
2121
ListControl.fixup(self)
2122
found = [o for o in self.items if o.selected and not o.disabled]
2124
if self._select_default:
2125
for o in self.items:
2130
# Ensure only one item selected. Choose the last one,
2131
# following IE and Firefox.
2132
for o in found[:-1]:
2135
def get_labels(self):
2138
class CheckboxControl(ListControl):
2145
def __init__(self, type, name, attrs, select_default=False, index=None):
    """Build a multiple-selection control holding one checkbox item.

    attrs: HTML attributes of the INPUT element; a missing "value" is
     filled in as "on" in the mapping passed in
    The item starts selected if attrs has a "checked" key.

    """
    attrs.setdefault("value", "on")
    ListControl.__init__(self, type, name, attrs, select_default,
                         called_as_base_class=True, index=index)
    self.__dict__["multiple"] = True
    o = Item(self, attrs, index)
    o.__dict__["_selected"] = "checked" in attrs
2153
def get_labels(self):
2157
class SelectControl(ListControl):
2164
OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.
2167
OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.
2169
SELECT control values and labels are subject to some messy defaulting
2170
rules. For example, if the HTML representation of the control is:
2173
<OPTION value=0 label="2002">current year</OPTION>
2174
<OPTION value=1>2001</OPTION>
2175
<OPTION>2000</OPTION>
2178
The items, in order, have labels "2002", "2001" and "2000", whereas their
2179
names (the OPTION values) are "0", "1" and "2000" respectively. Note that
2180
the value of the last OPTION in this example defaults to its contents, as
2181
specified by RFC 1866, as do the labels of the second and third OPTIONs.
2183
The OPTION labels are sometimes more meaningful than the OPTION values,
2184
which can make for more maintainable code.
2186
Additional read-only public attribute: attrs
2188
The attrs attribute is a dictionary of the original HTML attributes of the
2189
SELECT element. Other ListControls do not have this attribute, because in
2190
other cases the control as a whole does not correspond to any single HTML
2191
element. control.get(...).attrs may be used as usual to get at the HTML
2192
attributes of the HTML elements corresponding to individual list items (for
2193
SELECT controls, these are OPTION elements).
2195
Another special case is that the Item.attrs dictionaries have a special key
2196
"contents" which does not correspond to any real HTML attribute, but rather
2197
contains the contents of the OPTION element:
2199
<OPTION>this bit</OPTION>
2202
# HTML attributes here are treated slightly differently from other list
2204
# -The SELECT HTML attributes dictionary is stuffed into the OPTION
2205
# HTML attributes dictionary under the "__select" key.
2206
# -The content of each OPTION element is stored under the special
2207
# "contents" key of the dictionary.
2208
# After all this, the dictionary is passed to the SelectControl constructor
2209
# as the attrs argument, as usual. However:
2210
# -The first SelectControl constructed when building up a SELECT control
2211
# has a constructor attrs argument containing only the __select key -- so
2212
# this SelectControl represents an empty SELECT control.
2213
# -Subsequent SelectControls have both OPTION HTML-attribute in attrs and
2214
# the __select dictionary containing the SELECT HTML-attributes.
2216
def __init__(self, type, name, attrs, select_default=False, index=None):
2217
# fish out the SELECT HTML attributes from the OPTION HTML attributes
2219
self.attrs = attrs["__select"].copy()
2220
self.__dict__["_label"] = _get_label(self.attrs)
2221
self.__dict__["id"] = self.attrs.get("id")
2222
self.__dict__["multiple"] = self.attrs.has_key("multiple")
2223
# the majority of the contents, label, and value dance already happened
2224
contents = attrs.get("contents")
2225
attrs = attrs.copy()
2226
del attrs["__select"]
2228
ListControl.__init__(self, type, name, self.attrs, select_default,
2229
called_as_base_class=True, index=index)
2230
self.disabled = self.attrs.has_key("disabled")
2231
self.readonly = self.attrs.has_key("readonly")
2232
if attrs.has_key("value"):
2233
# otherwise it is a marker 'select started' token
2234
o = Item(self, attrs, index)
2235
o.__dict__["_selected"] = attrs.has_key("selected")
2236
# add 'label' label and contents label, if different. If both are
2237
# provided, the 'label' label is used for display in HTML
2238
# 4.0-compliant browsers (and any lower spec? not sure) while the
2239
# contents are used for display in older or less-compliant
2240
# browsers. We make label objects for both, if the values are
2242
label = attrs.get("label")
2244
o._labels.append(Label({"__text": label}))
2245
if contents and contents != label:
2246
o._labels.append(Label({"__text": contents}))
2248
o._labels.append(Label({"__text": contents}))
2251
ListControl.fixup(self)
2252
# Firefox doesn't exclude disabled items from those considered here
2253
# (i.e. from 'found', for both branches of the if below). Note that
2254
# IE6 doesn't support the disabled attribute on OPTIONs at all.
2255
found = [o for o in self.items if o.selected]
2257
if not self.multiple or self._select_default:
2258
for o in self.items:
2260
was_disabled = self.disabled
2261
self.disabled = False
2265
o.disabled = was_disabled
2267
elif not self.multiple:
2268
# Ensure only one item selected. Choose the last one,
2269
# following IE and Firefox.
2270
for o in found[:-1]:
2274
#---------------------------------------------------
2275
class SubmitControl(ScalarControl):
2283
def __init__(self, type, name, attrs, index=None):
2284
ScalarControl.__init__(self, type, name, attrs, index)
2285
# IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it
2286
# blank, Konqueror 3.1 defaults to "Submit". HTML spec. doesn't seem
2288
if self.value is None: self.value = ""
2289
self.readonly = True
2291
def get_labels(self):
2294
res.append(Label({"__text": self.value}))
2295
res.extend(ScalarControl.get_labels(self))
2298
def is_of_kind(self, kind):
    """Report whether this control belongs to the given kind.

    The only kind accepted here is "clickable"; every other value
    yields False.
    """
    matches = (kind == "clickable")
    return matches
2300
def _click(self, form, coord, return_type, request_class=urllib2.Request):
2301
self._clicked = coord
2302
r = form._switch_click(return_type, request_class)
2303
self._clicked = False
2306
def _totally_ordered_pairs(self):
2307
if not self._clicked:
2309
return ScalarControl._totally_ordered_pairs(self)
2312
#---------------------------------------------------
2313
class ImageControl(SubmitControl):
2319
Coordinates are specified using one of the HTMLForm.click* methods.
2322
def __init__(self, type, name, attrs, index=None):
2323
SubmitControl.__init__(self, type, name, attrs, index)
2324
self.readonly = False
2326
def _totally_ordered_pairs(self):
2327
clicked = self._clicked
2328
if self.disabled or not clicked:
2331
if name is None: return []
2333
(self._index, "%s.x" % name, str(clicked[0])),
2334
(self._index+1, "%s.y" % name, str(clicked[1])),
2338
pairs.append((self._index+2, name, value))
2341
get_labels = ScalarControl.get_labels
2343
# aliases, just to make str(control) and str(form) clearer
2344
class PasswordControl(TextControl):
    # No behaviour of its own: only the class name differs.
    pass


class HiddenControl(TextControl):
    # No behaviour of its own: only the class name differs.
    pass


class TextareaControl(TextControl):
    # No behaviour of its own: only the class name differs.
    pass


class SubmitButtonControl(SubmitControl):
    # No behaviour of its own: only the class name differs.
    pass
2350
def is_listcontrol(control):
    """Return what control.is_of_kind("list") reports for *control*."""
    answer = control.is_of_kind("list")
    return answer
2354
"""Represents a single HTML <form> ... </form> element.
2356
A form consists of a sequence of controls that usually have names, and
2357
which can take on various values. The values of the various types of
2358
controls represent variously: text, zero-or-one-of-many or many-of-many
2359
choices, and files to be uploaded. Some controls can be clicked on to
2360
submit the form, and clickable controls' values sometimes include the
2361
coordinates of the click.
2363
Forms can be filled in with data to be returned to the server, and then
2364
submitted, using the click method to generate a request object suitable for
2365
passing to urllib2.urlopen (or the click_request_data or click_pairs
2366
methods if you're not using urllib2).
2369
forms = ClientForm.ParseFile(html, base_uri)
2372
form["query"] = "Python"
2373
form.find_control("nr_results").get("lots").selected = True
2375
response = urllib2.urlopen(form.click())
2377
Usually, HTMLForm instances are not created directly. Instead, the
2378
ParseFile or ParseResponse factory functions are used. If you do construct
2379
HTMLForm objects yourself, however, note that an HTMLForm instance is only
2380
properly initialised after the fixup method has been called (ParseFile and
2381
ParseResponse do this for you). See ListControl.__doc__ for the reason
2384
Indexing a form (form["control_name"]) returns the named Control's value
2385
attribute. Assignment to a form index (form["control_name"] = something)
2386
is equivalent to assignment to the named Control's value attribute. If you
2387
need to be more specific than just supplying the control's name, use the
2388
set_value and get_value methods.
2390
ListControl values are lists of item names (specifically, the names of the
2391
items that are selected and not disabled, and hence are "successful" -- ie.
2392
cause data to be returned to the server). The list item's name is the
2393
value of the corresponding HTML element's "value" attribute.
2397
<INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>
2398
<INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>
2400
defines a CHECKBOX control with name "cheeses" which has two items, named
2401
"leicester" and "cheddar".
2405
<SELECT name="more_cheeses">
2407
<OPTION value="2" label="CHEDDAR">cheddar</OPTION>
2410
defines a SELECT control with name "more_cheeses" which has two items,
2411
named "1" and "2" (because the OPTION element's value HTML attribute
2412
defaults to the element contents -- see SelectControl.__doc__ for more on
2413
these defaulting rules).
2415
To select, deselect or otherwise manipulate individual list items, use the
2416
HTMLForm.find_control() and ListControl.get() methods. To set the whole
2417
value, do as for any other control: use indexing or the set_/get_value
2422
# select *only* the item named "cheddar"
2423
form["cheeses"] = ["cheddar"]
2424
# select "cheddar", leave other items unaffected
2425
form.find_control("cheeses").get("cheddar").selected = True
2427
Some controls (RADIO and SELECT without the multiple attribute) can only
2428
have zero or one items selected at a time. Some controls (CHECKBOX and
2429
SELECT with the multiple attribute) can have multiple items selected at a
2430
time. To set the whole value of a ListControl, assign a sequence to a form
2433
form["cheeses"] = ["cheddar", "leicester"]
2435
If the ListControl is not multiple-selection, the assigned list must be of
2438
To check if a control has an item, if an item is selected, or if an item is
2439
successful (selected and not disabled), respectively:
2441
"cheddar" in [item.name for item in form.find_control("cheeses").items]
2442
"cheddar" in [item.name for item in form.find_control("cheeses").items and
2444
"cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses"))
2446
Note that some list items may be disabled (see below).
2448
Note the following mistake:
2450
form[control_name] = control_value
2451
assert form[control_name] == control_value # not necessarily true
2453
The reason for this is that form[control_name] always gives the list items
2454
in the order they were listed in the HTML.
2456
List items (hence list values, too) can be referred to in terms of list
2457
item labels rather than list item names using the appropriate label
2458
arguments. Note that each item may have several labels.
2460
The question of default values of OPTION contents, labels and values is
2461
somewhat complicated: see SelectControl.__doc__ and
2462
ListControl.get_item_attrs.__doc__ if you think you need to know.
2464
Controls can be disabled or readonly. In either case, the control's value
2465
cannot be changed until you clear those flags (see example below).
2466
Disabled is the state typically represented by browsers by 'greying out' a
2467
control. Disabled controls are not 'successful' -- they don't cause data
2468
to get returned to the server. Readonly controls usually appear in
2469
browsers as read-only text boxes. Readonly controls are successful. List
2470
items can also be disabled. Attempts to select or deselect disabled items
2471
fail with AttributeError.
2473
If a lot of controls are readonly, it can be useful to do this:
2475
form.set_all_readonly(False)
2477
To clear a control's value attribute, so that it is not successful (until a
2478
value is subsequently set):
2480
form.clear("cheeses")
2484
control = form.find_control("cheeses")
2485
control.disabled = False
2486
control.readonly = False
2487
control.get("gruyere").disabled = True
2488
control.items[0].selected = True
2490
See the various Control classes for further documentation. Many methods
2491
take name, type, kind, id, label and nr arguments to specify the control to
2492
be operated on: see HTMLForm.find_control.__doc__.
2494
ControlNotFoundError (subclass of ValueError) is raised if the specified
2495
control can't be found. This includes occasions where a non-ListControl
2496
is found, but the method (set, for example) requires a ListControl.
2497
ItemNotFoundError (subclass of ValueError) is raised if a list item can't
2498
be found. ItemCountError (subclass of ValueError) is raised if an attempt
2499
is made to select more than one item and the control doesn't allow that, or
2500
set/get_single are called and the control contains more than one item.
2501
AttributeError is raised if a control or item is readonly or disabled and
2502
an attempt is made to alter its value.
2504
Security note: Remember that any passwords you store in HTMLForm instances
2505
will be saved to disk in the clear if you pickle them (directly or
2506
indirectly). The simplest solution to this is to avoid pickling HTMLForm
2507
objects. You could also pickle before filling in any password, or just set
2508
the password to "" before pickling.
2513
action: full (absolute URI) form action
2514
method: "GET" or "POST"
2515
enctype: form transfer encoding MIME type
2516
name: name of form (None if no name was specified)
2517
attrs: dictionary mapping original HTML form attributes to their values
2519
controls: list of Control instances; do not alter this list
2520
(instead, call form.new_control to make a Control and add it to the
2521
form, or control.add_to_form if you already have a Control instance)
2525
Methods for form filling:
2526
-------------------------
2528
Most of these methods have very similar arguments. See
2529
HTMLForm.find_control.__doc__ for details of the name, type, kind, label
2532
def find_control(self,
2533
name=None, type=None, kind=None, id=None, predicate=None,
2534
nr=None, label=None)
2536
get_value(name=None, type=None, kind=None, id=None, nr=None,
2537
by_label=False, # by_label is deprecated
2540
name=None, type=None, kind=None, id=None, nr=None,
2541
by_label=False, # by_label is deprecated
2545
clear(name=None, type=None, kind=None, id=None, nr=None, label=None)
2547
set_all_readonly(readonly)
2550
Method applying only to FileControls:
2552
add_file(file_object,
2553
content_type="application/octet-stream", filename=None,
2554
name=None, id=None, nr=None, label=None)
2557
Methods applying only to clickable controls:
2559
click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
2560
click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1),
2562
click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
2567
"text": TextControl,
2568
"password": PasswordControl,
2569
"hidden": HiddenControl,
2570
"textarea": TextareaControl,
2572
"isindex": IsindexControl,
2574
"file": FileControl,
2576
"button": IgnoreControl,
2577
"buttonbutton": IgnoreControl,
2578
"reset": IgnoreControl,
2579
"resetbutton": IgnoreControl,
2581
"submit": SubmitControl,
2582
"submitbutton": SubmitButtonControl,
2583
"image": ImageControl,
2585
"radio": RadioControl,
2586
"checkbox": CheckboxControl,
2587
"select": SelectControl,
2590
#---------------------------------------------------
2591
# Initialisation. Use ParseResponse / ParseFile instead.
2593
def __init__(self, action, method="GET",
2594
enctype="application/x-www-form-urlencoded",
2595
name=None, attrs=None,
2596
request_class=urllib2.Request,
2597
forms=None, labels=None, id_to_labels=None,
2598
backwards_compat=True):
2600
In the usual case, use ParseResponse (or ParseFile) to create new
2603
action: full (absolute URI) form action
2604
method: "GET" or "POST"
2605
enctype: form transfer encoding MIME type
2607
attrs: dictionary mapping original HTML form attributes to their values
2610
self.action = action
2611
self.method = method
2612
self.enctype = enctype
2614
if attrs is not None:
2615
self.attrs = attrs.copy()
2619
self._request_class = request_class
2621
# these attributes are used by zope.testbrowser
2622
self._forms = forms # this is a semi-public API!
2623
self._labels = labels # this is a semi-public API!
2624
self._id_to_labels = id_to_labels # this is a semi-public API!
2626
self.backwards_compat = backwards_compat # note __setattr__
2628
def __getattr__(self, name):
    """Fallback used when normal attribute lookup fails.

    The public name "backwards_compat" is served from the private
    _backwards_compat attribute; any other name is looked up on the
    HTMLForm class itself.
    """
    if name != "backwards_compat":
        return getattr(HTMLForm, name)
    return self._backwards_compat
2633
def __setattr__(self, name, value):
2635
if name == "backwards_compat":
2636
name = "_backwards_compat"
2638
for cc in self.controls:
2641
except AttributeError:
2645
for ll in ii.get_labels():
2646
ll._backwards_compat = value
2647
self.__dict__[name] = value
2649
def new_control(self, type, name, attrs,
2650
ignore_unknown=False, select_default=False, index=None):
2651
"""Adds a new control to the form.
2653
This is usually called by ParseFile and ParseResponse. Don't call it
2654
yourself unless you're building your own Control instances.
2656
Note that controls representing lists of items are built up from
2657
controls holding only a single list item. See ListControl.__doc__ for
2658
further information.
2660
type: type of control (see Control.__doc__ for a list)
2661
attrs: HTML attributes of control
2662
ignore_unknown: if true, use a dummy Control instance for controls of
2663
unknown type; otherwise, use a TextControl
2664
select_default: for RADIO and multiple-selection SELECT controls, pick
2665
the first item as the default if no 'selected' HTML attribute is
2666
present (this defaulting happens when the HTMLForm.fixup method is
2668
index: index of corresponding element in HTML (see
2669
MoreFormTests.test_interspersed_controls for motivation)
2673
klass = self.type2class.get(type)
2676
klass = IgnoreControl
2681
if issubclass(klass, ListControl):
2682
control = klass(type, name, a, select_default, index)
2684
control = klass(type, name, a, index)
2685
control.add_to_form(self)
2688
"""Normalise form after all controls have been added.
2690
This is usually called by ParseFile and ParseResponse. Don't call it
2691
yourself unless you're building your own Control instances.
2693
This method should only be called once, after all controls have been
2697
for control in self.controls:
2699
self.backwards_compat = self._backwards_compat
2701
#---------------------------------------------------
2703
header = "%s%s %s %s" % (
2704
(self.name and self.name+" " or ""),
2705
self.method, self.action, self.enctype)
2707
for control in self.controls:
2708
rep.append(" %s" % str(control))
2709
return "<%s>" % "\n".join(rep)
2711
#---------------------------------------------------
2712
# Form-filling methods.
2714
def __getitem__(self, name):
    """Return the value attribute of the control found by find_control(name)."""
    control = self.find_control(name)
    return control.value
2716
def __contains__(self, name):
    """Return whether the form has a control called *name*.

    find_control reports a missing control by raising
    ControlNotFoundError rather than by returning a false value, so the
    exception is translated into False here.  Without that, the
    expression ``name in form`` raised for every absent control instead
    of answering the membership question.

    Any other exception raised by find_control (for example TypeError for
    a name that is not string-like, or AmbiguityError) is left to
    propagate unchanged.
    """
    try:
        control = self.find_control(name)
    except ControlNotFoundError:
        return False
    # Keep the original truth test on the control that was found.
    return bool(control)
2718
def __setitem__(self, name, value):
2719
control = self.find_control(name)
2721
control.value = value
2722
except AttributeError, e:
2723
raise ValueError(str(e))
2726
name=None, type=None, kind=None, id=None, nr=None,
2727
by_label=False, # by_label is deprecated
2729
"""Return value of control.
2731
If only name and value arguments are supplied, equivalent to
2737
deprecation("form.get_value_by_label(...)")
2738
c = self.find_control(name, type, kind, id, label=label, nr=nr)
2741
meth = c.get_value_by_label
2742
except AttributeError:
2743
raise NotImplementedError(
2744
"control '%s' does not yet support by_label" % c.name)
2749
def set_value(self, value,
2750
name=None, type=None, kind=None, id=None, nr=None,
2751
by_label=False, # by_label is deprecated
2753
"""Set value of control.
2755
If only name and value arguments are supplied, equivalent to
2761
deprecation("form.get_value_by_label(...)")
2762
c = self.find_control(name, type, kind, id, label=label, nr=nr)
2765
meth = c.set_value_by_label
2766
except AttributeError:
2767
raise NotImplementedError(
2768
"control '%s' does not yet support by_label" % c.name)
2773
def get_value_by_label(
        self, name=None, type=None, kind=None, id=None, label=None, nr=None):
    """Return the result of get_value_by_label() on the matching control.

    The control is located with find_control using name, type, kind, id,
    label and nr.

    All arguments should be passed by name.

    """
    control = self.find_control(name, type, kind, id, label=label, nr=nr)
    return control.get_value_by_label()
2783
def set_value_by_label(
2785
name=None, type=None, kind=None, id=None, label=None, nr=None):
2788
All arguments should be passed by name.
2791
c = self.find_control(name, type, kind, id, label=label, nr=nr)
2792
c.set_value_by_label(value)
2794
def set_all_readonly(self, readonly):
    """Set the readonly attribute of every control in self.controls.

    readonly: any value; its truth value (True or False) is what gets
        stored on each control.
    """
    for ctl in self.controls:
        ctl.readonly = True if readonly else False
2798
def clear_all(self):
2799
"""Clear the value attributes of all controls in the form.
2801
See HTMLForm.clear.__doc__.
2804
for control in self.controls:
2808
name=None, type=None, kind=None, id=None, nr=None, label=None):
2809
"""Clear the value attribute of a control.
2811
As a result, the affected control will not be successful until a value
2812
is subsequently set. AttributeError is raised on readonly controls.
2815
c = self.find_control(name, type, kind, id, label=label, nr=nr)
2819
#---------------------------------------------------
2820
# Form-filling methods applying only to ListControls.
2822
def possible_items(self,  # deprecated
                   name=None, type=None, kind=None, id=None,
                   nr=None, by_label=False, label=None):
    """Return a list of all values that the specified control can take.

    The list control is located with _find_list_control, and by_label is
    handed on to that control's own possible_items method.
    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    items = list_control.possible_items(by_label)
    return items
2829
def set(self, selected, item_name,  # deprecated
        name=None, type=None, kind=None, id=None, nr=None,
        by_label=False, label=None):
    """Select / deselect named list item.

    selected: boolean selected state
    item_name: passed, together with by_label, to the set method of the
        list control located with _find_list_control

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.set(selected, item_name, by_label)
2839
def toggle(self, item_name,  # deprecated
           name=None, type=None, kind=None, id=None, nr=None,
           by_label=False, label=None):
    """Toggle selected state of named list item.

    The list control is located with _find_list_control; item_name and
    by_label are handed on to that control's toggle method.
    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.toggle(item_name, by_label)
2846
def set_single(self, selected,  # deprecated
               name=None, type=None, kind=None, id=None,
               nr=None, by_label=None, label=None):
    """Select / deselect list item in a control having only one item.

    If the control has multiple list items, ItemCountError is raised.

    This is a convenience for controls whose single item has a name you
    would rather not have to know -- usually something meaningless like
    "1" or "on".

    For example, if a checkbox has a single item named "on", the following
    two calls are equivalent:

    control.toggle("on")
    control.toggle_single()

    """  # by_label ignored and deprecated
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.set_single(selected)
2866
def toggle_single(self, name=None, type=None, kind=None, id=None,
                  nr=None, by_label=None, label=None):  # deprecated
    """Toggle selected state of list item in control having only one item.

    The rest is as for HTMLForm.set_single.__doc__.

    """  # by_label ignored and deprecated
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.toggle_single()
2875
#---------------------------------------------------
2876
# Form-filling method applying only to FileControls.
2878
def add_file(self, file_object, content_type=None, filename=None,
2879
name=None, id=None, nr=None, label=None):
2880
"""Add a file to be uploaded.
2882
file_object: file-like object (with read method) from which to read
2884
content_type: MIME content type of data to upload
2885
filename: filename to pass to server
2887
If filename is None, no filename is sent to the server.
2889
If content_type is None, the content type is guessed based on the
2890
filename and the data read from the file object.
2893
At the moment, guessed content type is always application/octet-stream.
2894
Use sndhdr, imghdr modules. Should also try to guess HTML, XML, and
2897
Note the following useful HTML attributes of file upload controls (see
2898
HTML 4.01 spec, section 17):
2900
accept: comma-separated list of content types that the server will
2901
handle correctly; you can use this to filter out non-conforming files
2902
size: XXX IIRC, this is indicative of whether form wants multiple or
2904
maxlength: XXX hint of max content length in bytes?
2907
self.find_control(name, "file", id=id, label=label, nr=nr).add_file(
2908
file_object, content_type, filename)
2910
#---------------------------------------------------
2911
# Form submission methods, applying only to clickable controls.
2913
def click(self, name=None, type=None, id=None, nr=0, coord=(1,1),
2914
request_class=urllib2.Request,
2916
"""Return request that would result from clicking on a control.
2918
The request object is a urllib2.Request instance, which you can pass to
2919
urllib2.urlopen (or ClientCookie.urlopen).
2921
Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
2922
IMAGEs) can be clicked.
2924
Will click on the first clickable control, subject to the name, type
2925
and nr arguments (as for find_control). If no name, type, id or number
2926
is specified and there are no clickable controls, a request will be
2927
returned for the form in its current, un-clicked, state.
2929
IndexError is raised if any of name, type, id or nr is specified but no
2930
matching control is found. ValueError is raised if the HTMLForm has an
2931
enctype attribute that is not recognised.
2933
You can optionally specify a coordinate to click at, which only makes a
2934
difference if you clicked on an image.
2937
return self._click(name, type, id, label, nr, coord, "request",
2938
self._request_class)
2940
def click_request_data(self,
2941
name=None, type=None, id=None,
2943
request_class=urllib2.Request,
2945
"""As for click method, but return a tuple (url, data, headers).
2947
You can use this data to send a request to the server. This is useful
2948
if you're using httplib or urllib rather than urllib2. Otherwise, use
2951
# Untested. Have to subclass to add headers, I think -- so use urllib2
2954
url, data, hdrs = form.click_request_data()
2955
r = urllib.urlopen(url, data)
2957
# Untested. I don't know of any reason to use httplib -- you can get
2958
# just as much control with urllib2.
2959
import httplib, urlparse
2960
url, data, hdrs = form.click_request_data()
2962
host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:])
2963
conn = httplib.HTTPConnection(host)
2965
conn.request("POST", path, data, hdrs)
2967
conn.request("GET", path, headers=hdrs)
2968
r = conn.getresponse()
2971
return self._click(name, type, id, label, nr, coord, "request_data",
2972
self._request_class)
2974
def click_pairs(self, name=None, type=None, id=None,
2977
"""As for click_request_data, but returns a list of (key, value) pairs.
2979
You can use this list as an argument to ClientForm.urlencode. This is
2980
usually only useful if you're using httplib or urllib rather than
2981
urllib2 or ClientCookie. It may also be useful if you want to manually
2982
tweak the keys and/or values, but this should not be necessary.
2983
Otherwise, use the click method.
2985
Note that this method is only useful for forms of MIME type
2986
x-www-form-urlencoded. In particular, it does not return the
2987
information required for file upload. If you need file upload and are
2988
not using urllib2, use click_request_data.
2990
Also note that Python 2.0's urllib.urlencode is slightly broken: it
2991
only accepts a mapping, not a sequence of pairs, as an argument. This
2992
messes up any ordering in the argument. Use ClientForm.urlencode
2996
return self._click(name, type, id, label, nr, coord, "pairs",
2997
self._request_class)
2999
#---------------------------------------------------
3001
def find_control(self,
3002
name=None, type=None, kind=None, id=None,
3003
predicate=None, nr=None,
3005
"""Locate and return some specific control within the form.
3007
At least one of the name, type, kind, id, label, predicate and nr arguments must
3008
be supplied. If no matching control is found, ControlNotFoundError is
3011
If name is specified, then the control must have the indicated name.
3013
If type is specified then the control must have the specified type (in
3014
addition to the types possible for <input> HTML tags: "text",
3015
"password", "hidden", "submit", "image", "button", "radio", "checkbox",
3016
"file" we also have "reset", "buttonbutton", "submitbutton",
3017
"resetbutton", "textarea", "select" and "isindex").
3019
If kind is specified, then the control must fall into the specified
3020
group, each of which satisfies a particular interface. The types are
3021
"text", "list", "multilist", "singlelist", "clickable" and "file".
3023
If id is specified, then the control must have the indicated id.
3025
If predicate is specified, then the control must match that function.
3026
The predicate function is passed the control as its single argument,
3027
and should return a boolean value indicating whether the control
3030
nr, if supplied, is the sequence number of the control (where 0 is the
3031
first). Note that control 0 is the first control matching all the
3032
other arguments (if supplied); it is not necessarily the first control
3033
in the form. If no nr is supplied, AmbiguityError is raised if
3034
multiple controls match the other arguments (unless the
3035
backwards_compat attribute is true).
3037
If label is specified, then the control must have this label. Note
3038
that radio controls and checkboxes never have labels: their items do.
3041
if ((name is None) and (type is None) and (kind is None) and
3042
(id is None) and (label is None) and (predicate is None) and
3045
"at least one argument must be supplied to specify control")
3046
return self._find_control(name, type, kind, id, label, predicate, nr)
3048
#---------------------------------------------------
3051
def _find_list_control(self,
3052
name=None, type=None, kind=None, id=None,
3053
label=None, nr=None):
3054
if ((name is None) and (type is None) and (kind is None) and
3055
(id is None) and (label is None) and (nr is None)):
3057
"at least one argument must be supplied to specify control")
3059
return self._find_control(name, type, kind, id, label,
3062
def _find_control(self, name, type, kind, id, label, predicate, nr):
3063
if (name is not None) and not isstringlike(name):
3064
raise TypeError("control name must be string-like")
3065
if (type is not None) and not isstringlike(type):
3066
raise TypeError("control type must be string-like")
3067
if (kind is not None) and not isstringlike(kind):
3068
raise TypeError("control kind must be string-like")
3069
if (id is not None) and not isstringlike(id):
3070
raise TypeError("control id must be string-like")
3071
if (label is not None) and not isstringlike(label):
3072
raise TypeError("control label must be string-like")
3073
if (predicate is not None) and not callable(predicate):
3074
raise TypeError("control predicate must be callable")
3075
if (nr is not None) and nr < 0:
3076
raise ValueError("control number must be a positive integer")
3081
if nr is None and self.backwards_compat:
3084
for control in self.controls:
3085
if name is not None and name != control.name:
3087
if type is not None and type != control.type:
3089
if kind is not None and not control.is_of_kind(kind):
3091
if id is not None and id != control.id:
3093
if predicate and not predicate(control):
3096
for l in control.get_labels():
3097
if l.text.find(label) > -1:
3103
return control # early exit: unambiguous due to nr
3111
if found and not ambiguous:
3115
if name is not None: description.append("name '%s'" % name)
3116
if type is not None: description.append("type '%s'" % type)
3117
if kind is not None: description.append("kind '%s'" % kind)
3118
if id is not None: description.append("id '%s'" % id)
3119
if label is not None: description.append("label '%s'" % label)
3120
if predicate is not None:
3121
description.append("predicate %s" % predicate)
3122
if orig_nr: description.append("nr %d" % orig_nr)
3123
description = ", ".join(description)
3126
raise AmbiguityError("more than one control matching "+description)
3128
raise ControlNotFoundError("no control matching "+description)
3131
def _click(self, name, type, id, label, nr, coord, return_type,
3132
request_class=urllib2.Request):
3134
control = self._find_control(
3135
name, type, "clickable", id, label, None, nr)
3136
except ControlNotFoundError:
3137
if ((name is not None) or (type is not None) or (id is not None) or
3140
# no clickable controls, but no control was explicitly requested,
3141
# so return state without clicking any control
3142
return self._switch_click(return_type, request_class)
3144
return control._click(self, coord, return_type, request_class)
3147
"""Return sequence of (key, value) pairs suitable for urlencoding."""
3148
return [(k, v) for (i, k, v, c_i) in self._pairs_and_controls()]
3151
def _pairs_and_controls(self):
3152
"""Return sequence of (index, key, value, control_index)
3153
of totally ordered pairs suitable for urlencoding.
3155
control_index is the index of the control in self.controls
3158
for control_index in range(len(self.controls)):
3159
control = self.controls[control_index]
3160
for ii, key, val in control._totally_ordered_pairs():
3161
pairs.append((ii, key, val, control_index))
3163
# stable sort by ONLY first item in tuple
3168
def _request_data(self):
3169
"""Return a tuple (url, data, headers)."""
3170
method = self.method.upper()
3171
#scheme, netloc, path, parameters, query, frag = urlparse.urlparse(self.action)
3172
parts = urlparse.urlparse(self.action)
3173
rest, (query, frag) = parts[:-2], parts[-2:]
3176
if self.enctype != "application/x-www-form-urlencoded":
3178
"unknown GET form encoding type '%s'" % self.enctype)
3179
parts = rest + (urlencode(self._pairs()), "")
3180
uri = urlparse.urlunparse(parts)
3181
return uri, None, []
3182
elif method == "POST":
3183
parts = rest + (query, "")
3184
uri = urlparse.urlunparse(parts)
3185
if self.enctype == "application/x-www-form-urlencoded":
3186
return (uri, urlencode(self._pairs()),
3187
[("Content-type", self.enctype)])
3188
elif self.enctype == "multipart/form-data":
3191
mw = MimeWriter(data, http_hdrs)
3192
f = mw.startmultipartbody("form-data", add_to_http_hdrs=True,
3194
for ii, k, v, control_index in self._pairs_and_controls():
3195
self.controls[control_index]._write_mime_data(mw, k, v)
3197
return uri, data.getvalue(), http_hdrs
3200
"unknown POST form encoding type '%s'" % self.enctype)
3202
raise ValueError("Unknown method '%s'" % method)
3204
def _switch_click(self, return_type, request_class=urllib2.Request):
3205
# This is called by HTMLForm and clickable Controls to hide switching
3207
if return_type == "pairs":
3208
return self._pairs()
3209
elif return_type == "request_data":
3210
return self._request_data()
3212
req_data = self._request_data()
3213
req = request_class(req_data[0], req_data[1])
3214
for key, val in req_data[2]:
3215
add_hdr = req.add_header
3216
if key.lower() == "content-type":
3218
add_hdr = req.add_unredirected_header
3219
except AttributeError:
3220
# pre-2.4 and not using ClientCookie