"""Request body processing for CherryPy.

When an HTTP request includes an entity body, it is often desirable to
provide that information to applications in a form other than the raw bytes.
Different content types demand different approaches. Examples:

 * For a GIF file, we want the raw bytes in a stream.
 * An HTML form is better parsed into its component fields, and each text field
    decoded from bytes to unicode.
 * A JSON body should be deserialized into a Python dict or list.

When the request contains a Content-Type header, the media type is used as a
key to look up a value in the 'request.body.processors' dict. If the full media
type is not found, then the major type is tried; for example, if no processor
is found for the 'image/jpeg' type, then we look for a processor for the 'image'
types altogether. If neither the full type nor the major type has a matching
processor, then a default processor is used (self.default_proc). For most
types, this means no processing is done, and the body is left unread as a
raw byte stream. Processors are configurable in an 'on_start_resource' hook.

Some processors, especially those for the 'text' types, attempt to decode bytes
to unicode. If the Content-Type request header includes a 'charset' parameter,
this is used to decode the entity. Otherwise, one or more default charsets may
be attempted, although this decision is up to each processor. If a processor
successfully decodes an Entity or Part, it should set the 'charset' attribute
on the Entity or Part to the name of the successful charset, so that
applications can easily re-encode or transcode the value if they wish.

If the Content-Type of the request entity is of major type 'multipart', then
the above parsing process, and possibly a decoding process, is performed for
each part.

For both the full entity and multipart parts, a Content-Disposition header may
be used to fill .name and .filename attributes on the request.body or the Part.
"""
39
from urllib import unquote_plus
42
from cherrypy.lib import httputil
45
# -------------------------------- Processors -------------------------------- #
47
def process_urlencoded(entity):
48
"""Read application/x-www-form-urlencoded data into entity.params."""
50
for charset in entity.attempt_charsets:
53
for aparam in qs.split('&'):
54
for pair in aparam.split(';'):
58
atoms = pair.split('=', 1)
62
key = unquote_plus(atoms[0]).decode(charset)
63
value = unquote_plus(atoms[1]).decode(charset)
66
if not isinstance(params[key], list):
67
params[key] = [params[key]]
68
params[key].append(value)
71
except UnicodeDecodeError:
74
entity.charset = charset
77
raise cherrypy.HTTPError(
78
400, "The request entity could not be decoded. The following "
79
"charsets were attempted: %s" % repr(entity.attempt_charsets))
81
# Now that all values have been successfully parsed and decoded,
82
# apply them to the entity.params dict.
83
for key, value in params.items():
84
if key in entity.params:
85
if not isinstance(entity.params[key], list):
86
entity.params[key] = [entity.params[key]]
87
entity.params[key].append(value)
89
entity.params[key] = value
92
def process_multipart(entity):
93
"""Read all multipart parts into entity.parts."""
95
if u'boundary' in entity.content_type.params:
96
# http://tools.ietf.org/html/rfc2046#section-5.1.1
97
# "The grammar for parameters on the Content-type field is such that it
98
# is often necessary to enclose the boundary parameter values in quotes
99
# on the Content-type line"
100
ib = entity.content_type.params['boundary'].strip(u'"')
102
if not re.match(u"^[ -~]{0,200}[!-~]$", ib):
103
raise ValueError(u'Invalid boundary in multipart form: %r' % (ib,))
105
ib = (u'--' + ib).encode('ascii')
107
# Find the first marker
109
b = entity.readline()
119
part = entity.part_class.from_fp(entity.fp, ib)
120
entity.parts.append(part)
125
def process_multipart_form_data(entity):
126
"""Read all multipart/form-data parts into entity.parts or entity.params."""
127
process_multipart(entity)
130
for part in entity.parts:
131
if part.name is None:
132
kept_parts.append(part)
134
if part.filename is None:
135
# It's a regular field
136
entity.params[part.name] = part.fullvalue()
138
# It's a file upload. Retain the whole part so consumer code
139
# has access to its .file and .filename attributes.
140
entity.params[part.name] = part
142
entity.parts = kept_parts
144
def _old_process_multipart(entity):
145
"""The behavior of 3.2 and lower. Deprecated and will be changed in 3.3."""
146
process_multipart(entity)
148
params = entity.params
150
for part in entity.parts:
151
if part.name is None:
156
if part.filename is None:
157
# It's a regular field
158
value = part.fullvalue()
160
# It's a file upload. Retain the whole part so consumer code
161
# has access to its .file and .filename attributes.
165
if not isinstance(params[key], list):
166
params[key] = [params[key]]
167
params[key].append(value)
173
# --------------------------------- Entities --------------------------------- #
176
class Entity(object):
177
"""An HTTP request body, or MIME multipart body."""
179
__metaclass__ = cherrypy._AttributeDocstrings
183
If the request Content-Type is 'application/x-www-form-urlencoded' or
184
multipart, this will be a dict of the params pulled from the entity
185
body; that is, it will be the portion of request.params that come
186
from the message body (sometimes called "POST params", although they
187
can be sent with various HTTP method verbs). This value is set between
188
the 'before_request_body' and 'before_handler' hooks (assuming that
189
process_request_body is True)."""
191
default_content_type = u'application/x-www-form-urlencoded'
192
# http://tools.ietf.org/html/rfc2046#section-4.1.2:
193
# "The default character set, which must be assumed in the
194
# absence of a charset parameter, is US-ASCII."
195
# However, many browsers send data in utf-8 with no charset.
196
attempt_charsets = [u'utf-8']
197
processors = {u'application/x-www-form-urlencoded': process_urlencoded,
198
u'multipart/form-data': process_multipart_form_data,
199
u'multipart': process_multipart,
202
def __init__(self, fp, headers, params=None, parts=None):
203
# Make an instance-specific copy of the class processors
204
# so Tools, etc. can replace them per-request.
205
self.processors = self.processors.copy()
208
self.headers = headers
219
self.content_type = headers.elements(u'Content-Type')
220
if self.content_type:
221
self.content_type = self.content_type[0]
223
self.content_type = httputil.HeaderElement.from_str(
224
self.default_content_type)
226
# Copy the class 'attempt_charsets', prepending any Content-Type charset
227
dec = self.content_type.params.get(u"charset", None)
229
dec = dec.decode('ISO-8859-1')
230
self.attempt_charsets = [dec] + [c for c in self.attempt_charsets
233
self.attempt_charsets = self.attempt_charsets[:]
237
clen = headers.get(u'Content-Length', None)
238
# If Transfer-Encoding is 'chunked', ignore any Content-Length.
239
if clen is not None and 'chunked' not in headers.get(u'Transfer-Encoding', ''):
241
self.length = int(clen)
245
# Content-Disposition
248
disp = headers.elements(u'Content-Disposition')
251
if 'name' in disp.params:
252
self.name = disp.params['name']
253
if self.name.startswith(u'"') and self.name.endswith(u'"'):
254
self.name = self.name[1:-1]
255
if 'filename' in disp.params:
256
self.filename = disp.params['filename']
257
if self.filename.startswith(u'"') and self.filename.endswith(u'"'):
258
self.filename = self.filename[1:-1]
260
# The 'type' attribute is deprecated in 3.2; remove it in 3.3.
261
type = property(lambda self: self.content_type)
263
def read(self, size=None, fp_out=None):
    """Read from the underlying stream.

    Thin wrapper: both arguments are forwarded unchanged to
    self.fp.read(size, fp_out) and its result is returned as-is.
    """
    return self.fp.read(size, fp_out)
266
def readline(self, size=None):
    """Read one line from the underlying stream.

    Thin wrapper: `size` is forwarded unchanged to self.fp.readline(size)
    and its result is returned as-is.
    """
    return self.fp.readline(size)
269
def readlines(self, sizehint=None):
    """Read lines from the underlying stream.

    Thin wrapper: `sizehint` is forwarded unchanged to
    self.fp.readlines(sizehint) and its result is returned as-is.
    """
    return self.fp.readlines(sizehint)
276
line = self.readline()
281
def read_into_file(self, fp_out=None):
282
"""Read the request body into fp_out (or make_file() if None). Return fp_out."""
284
fp_out = self.make_file()
285
self.read(fp_out=fp_out)
289
"""Return a file into which the request body will be read.
291
By default, this will return a TemporaryFile. Override as needed."""
292
return tempfile.TemporaryFile()
295
"""Return this entity as a string, whether stored in a file or not."""
297
# It was stored in a tempfile. Read it.
299
value = self.file.read()
306
"""Execute the best-match processor for the given media type."""
308
ct = self.content_type.value
310
proc = self.processors[ct]
312
toptype = ct.split(u'/', 1)[0]
314
proc = self.processors[toptype]
322
def default_proc(self):
323
# Leave the fp alone for someone else to read. This works fine
324
# for request.body, but the Part subclasses need to override this
325
# so they can move on to the next part.
330
"""A MIME part entity, part of a multipart entity."""
332
default_content_type = u'text/plain'
333
# "The default character set, which must be assumed in the absence of a
334
# charset parameter, is US-ASCII."
335
attempt_charsets = [u'us-ascii', u'utf-8']
336
# This is the default in stdlib cgi. We may want to increase it.
339
def __init__(self, fp, headers, boundary):
340
Entity.__init__(self, fp, headers)
341
self.boundary = boundary
345
def from_fp(cls, fp, boundary):
    """Alternate constructor: build an instance from an open stream.

    The headers are obtained with cls.read_headers(fp); the new object is
    then created as cls(fp, headers, boundary) and returned.
    """
    headers = cls.read_headers(fp)
    return cls(fp, headers, boundary)
# Rebound with classmethod() rather than a decorator, matching the
# convention used for read_headers in this file.
from_fp = classmethod(from_fp)
350
def read_headers(cls, fp):
351
headers = httputil.HeaderMap()
355
# No more data--illegal end of headers
356
raise EOFError(u"Illegal end of headers.")
359
# Normal end of headers
361
if not line.endswith('\r\n'):
362
raise ValueError(u"MIME requires CRLF terminators: %r" % line)
365
# It's a continuation line.
366
v = line.strip().decode(u'ISO-8859-1')
368
k, v = line.split(":", 1)
369
k = k.strip().decode(u'ISO-8859-1')
370
v = v.strip().decode(u'ISO-8859-1')
372
existing = headers.get(k)
374
v = u", ".join((existing, v))
378
read_headers = classmethod(read_headers)
380
def read_lines_to_boundary(self, fp_out=None):
381
"""Read bytes from self.fp and return or write them to a file.
383
If the 'fp_out' argument is None (the default), all bytes read are
384
returned in a single byte string.
386
If the 'fp_out' argument is not None, it must be a file-like object that
387
supports the 'write' method; all bytes read will be written to the fp,
388
and that fp is returned.
390
endmarker = self.boundary + "--"
396
line = self.fp.readline(1<<16)
398
raise EOFError(u"Illegal end of multipart body.")
399
if line.startswith("--") and prev_lf:
400
strippedline = line.strip()
401
if strippedline == self.boundary:
403
if strippedline == endmarker:
409
if line.endswith("\r\n"):
413
elif line.endswith("\n"):
424
if seen > self.maxrambytes:
425
fp_out = self.make_file()
432
result = ''.join(lines)
433
for charset in self.attempt_charsets:
435
result = result.decode(charset)
436
except UnicodeDecodeError:
439
self.charset = charset
442
raise cherrypy.HTTPError(
443
400, "The request entity could not be decoded. The following "
444
"charsets were attempted: %s" % repr(self.attempt_charsets))
449
def default_proc(self):
451
# Always read into a file if a .filename was given.
452
self.file = self.read_into_file()
454
result = self.read_lines_to_boundary()
455
if isinstance(result, basestring):
460
def read_into_file(self, fp_out=None):
461
"""Read the request body into fp_out (or make_file() if None). Return fp_out."""
463
fp_out = self.make_file()
464
self.read_lines_to_boundary(fp_out=fp_out)
467
Entity.part_class = Part
470
class Infinity(object):
471
def __cmp__(self, other):
473
def __sub__(self, other):
478
# Header names that may legitimately repeat. The trailer-reading code in
# this file checks membership in this list and, on a match, joins the new
# value onto the existing one with ", ".
comma_separated_headers = [
    'Accept', 'Accept-Charset', 'Accept-Encoding',
    'Accept-Language', 'Accept-Ranges', 'Allow', 'Cache-Control', 'Connection',
    'Content-Encoding', 'Content-Language', 'Expect', 'If-Match',
    'If-None-Match', 'Pragma', 'Proxy-Authenticate', 'Te', 'Trailer',
    'Transfer-Encoding', 'Upgrade', 'Vary', 'Via', 'Warning', 'Www-Authenticate',
]
487
def __init__(self, fp, length, maxbytes, bufsize=8192, has_trailers=False):
488
# Wrap our fp in a buffer so peek() works
491
self.maxbytes = maxbytes
493
self.bufsize = bufsize
496
self.has_trailers = has_trailers
498
def read(self, size=None, fp_out=None):
499
"""Read bytes from the request body and return or write them to a file.
501
A number of bytes less than or equal to the 'size' argument are read
502
off the socket. The actual number of bytes read are tracked in
503
self.bytes_read. The number may be smaller than 'size' when 1) the
504
client sends fewer bytes, 2) the 'Content-Length' request header
505
specifies fewer bytes than requested, or 3) the number of bytes read
506
exceeds self.maxbytes (in which case, 413 is raised).
508
If the 'fp_out' argument is None (the default), all bytes read are
509
returned in a single byte string.
511
If the 'fp_out' argument is not None, it must be a file-like object that
512
supports the 'write' method; all bytes read will be written to the fp,
513
and None is returned.
516
if self.length is None:
522
remaining = self.length - self.bytes_read
523
if size and size < remaining:
534
# Read bytes from the buffer.
540
data = self.buffer[:remaining]
541
self.buffer = self.buffer[remaining:]
546
self.bytes_read += datalen
547
if self.maxbytes and self.bytes_read > self.maxbytes:
548
raise cherrypy.HTTPError(413)
556
# Read bytes from the socket.
558
chunksize = min(remaining, self.bufsize)
560
data = self.fp.read(chunksize)
562
if e.__class__.__name__ == 'MaxSizeExceeded':
563
# Post data is too big
564
raise cherrypy.HTTPError(
565
413, "Maximum request length: %r" % e.args[1])
575
self.bytes_read += datalen
576
if self.maxbytes and self.bytes_read > self.maxbytes:
577
raise cherrypy.HTTPError(413)
586
return ''.join(chunks)
588
def readline(self, size=None):
589
"""Read a line from the request body and return it."""
591
while size is None or size > 0:
592
chunksize = self.bufsize
593
if size is not None and size < self.bufsize:
595
data = self.read(chunksize)
598
pos = data.find('\n') + 1
600
chunks.append(data[:pos])
601
remainder = data[pos:]
602
self.buffer += remainder
603
self.bytes_read -= len(remainder)
607
return ''.join(chunks)
609
def readlines(self, sizehint=None):
610
"""Read lines from the request body and return them."""
611
if self.length is not None:
613
sizehint = self.length - self.bytes_read
615
sizehint = min(sizehint, self.length - self.bytes_read)
620
line = self.readline()
631
if self.has_trailers and hasattr(self.fp, 'read_trailer_lines'):
635
for line in self.fp.read_trailer_lines():
637
# It's a continuation line.
641
k, v = line.split(":", 1)
643
raise ValueError("Illegal header line.")
644
k = k.strip().title()
647
if k in comma_separated_headers:
648
existing = self.trailers.get(envname)
650
v = ", ".join((existing, v))
653
if e.__class__.__name__ == 'MaxSizeExceeded':
654
# Post data is too big
655
raise cherrypy.HTTPError(
656
413, "Maximum request length: %r" % e.args[1])
661
class RequestBody(Entity):
663
# Don't parse the request body at all if the client didn't provide
664
# a Content-Type header. See http://www.cherrypy.org/ticket/790
665
default_content_type = u''
670
def __init__(self, fp, headers, params=None, request_params=None):
671
Entity.__init__(self, fp, headers, params)
673
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
674
# When no explicit charset parameter is provided by the
675
# sender, media subtypes of the "text" type are defined
676
# to have a default charset value of "ISO-8859-1" when
678
if self.content_type.value.startswith('text/'):
679
for c in (u'ISO-8859-1', u'iso-8859-1', u'Latin-1', u'latin-1'):
680
if c in self.attempt_charsets:
683
self.attempt_charsets.append(u'ISO-8859-1')
685
# Temporary fix while deprecating passing .parts as .params.
686
self.processors[u'multipart'] = _old_process_multipart
688
if request_params is None:
690
self.request_params = request_params
693
"""Include body params in request params."""
694
# "The presence of a message-body in a request is signaled by the
695
# inclusion of a Content-Length or Transfer-Encoding header field in
696
# the request's message-headers."
697
# It is possible to send a POST request with no body, for example;
698
# however, app developers are responsible in that case to set
699
# cherrypy.request.process_body to False so this method isn't called.
700
h = cherrypy.serving.request.headers
701
if u'Content-Length' not in h and u'Transfer-Encoding' not in h:
702
raise cherrypy.HTTPError(411)
704
self.fp = SizedReader(self.fp, self.length,
705
self.maxbytes, bufsize=self.bufsize,
706
has_trailers='Trailer' in h)
707
super(RequestBody, self).process()
709
# Body params should also be a part of the request_params
711
request_params = self.request_params
712
for key, value in self.params.items():
713
# Python 2 only: keyword arguments must be byte strings (type 'str').
714
if isinstance(key, unicode):
715
key = key.encode('ISO-8859-1')
717
if key in request_params:
718
if not isinstance(request_params[key], list):
719
request_params[key] = [request_params[key]]
720
request_params[key].append(value)
722
request_params[key] = value