"""
Multi-part parsing for file uploads.

Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
file upload handlers for processing.
"""
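# Usage sketch (illustrative only; assumes ``request`` is a Django HttpRequest
# whose body is multipart/form-data):
#
#     parser = MultiPartParser(request.META, request, request.upload_handlers)
#     post, files = parser.parse()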
import cgi

from django.conf import settings
from django.core.exceptions import SuspiciousOperation
from django.utils.datastructures import MultiValueDict
from django.utils.encoding import force_unicode
from django.utils.text import unescape_entities
from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers

__all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')
class MultiPartParserError(Exception):
    pass

class InputStreamExhausted(Exception):
    """
    No more reads are allowed from this device.
    """
    pass

RAW = "raw"
FILE = "file"
FIELD = "field"

class MultiPartParser(object):
    """
    An RFC 2388 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """
    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """
        # Content-Type should contain multipart and the boundary information.
        content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
        if not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        # Parse the header to get the boundary to split the parts.
        ctypes, opts = parse_header(content_type)
        boundary = opts.get('boundary')
        if not boundary or not cgi.valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)

        # Content-Length should contain the length of the body we are about
        # to receive.
        try:
            content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH', 0)))
        except (ValueError, TypeError):
            # For now set it to 0; we'll try again later on down.
            content_length = 0

        if content_length < 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)
        self._boundary = boundary
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)
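        # Worked example (illustrative): handlers with chunk sizes of
        # 64 * 2**10 and 2**20 give min([2**31 - 4, 65536, 1048576]) == 65536,
        # i.e. the smallest handler chunk size wins.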

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers

    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Returns a tuple containing the POST and FILES dictionary, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # The HTTP spec says that Content-Length >= 0 is valid, so handle
        # content-length == 0 here before continuing.
        if self._content_length == 0:
            return QueryDict(MultiValueDict(), encoding=self._encoding), MultiValueDict()

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if the handler will want to take care of the parsing.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field, we can just set it in the post
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except Exception:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                    except Exception:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception as e:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk,
                                                                   counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break

                    except SkipFile:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload as e:
            if not e.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed
            exhaust(limited_input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files

    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signalling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(force_unicode(old_field_name,
                                                     self._encoding,
                                                     errors='replace'),
                                       file_obj)
                break

    def IE_sanitize(self, filename):
        """Cleanup filename from Internet Explorer full paths."""
        return filename and filename[filename.rfind("\\") + 1:].strip()
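    # Example (illustrative): IE_sanitize(u'C:\\Docs\\photo.jpg') returns
    # u'photo.jpg'; a bare u'photo.jpg' passes through unchanged, since
    # rfind returns -1 and the slice then starts at index 0.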

class LazyStream(object):
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """
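    # Behavior sketch (illustrative), with a simple producer:
    #
    #     stream = LazyStream(iter(['abcdef']))
    #     stream.read(3)        # 'abc'; tell() == 3
    #     stream.unget('abc')   # pushed back; tell() rewinds to 0
    #     stream.read()         # 'abcdef'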
    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._empty = False
        self._leftover = ''
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = (size is not None and [size] or [self._remaining])[0]
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield ''.join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream, stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, 'remaining bytes to read should never go negative'

                chunk = self.next()

                emitting = chunk[:remaining]
                self.unget(chunk[remaining:])
                remaining -= len(emitting)
                yield emitting

        out = ''.join(parts())
        return out

    def next(self):
        """
        Used when the exact number of bytes to read is unimportant.

        This procedure just returns whatever chunk is conveniently returned
        from the iterator. Useful to avoid unnecessary bookkeeping if
        performance is an issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = ''
        else:
            output = self._producer.next()
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replaces the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Places bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = ''.join([bytes, self._leftover])

    def _update_unget_history(self, num_bytes):
        """
        Updates the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len([current_number for current_number in self._unget_history
                            if current_number == num_bytes])

        if number_equal > 40:
            raise SuspiciousOperation(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )

class ChunkIter(object):
    """
    An iterable that will yield chunks of data. Given a file-like object as the
    constructor, this object will yield chunks of read operations from that
    object.
    """
    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def next(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self

class LimitBytes(object):
    """ Limit bytes for a file object. """
    def __init__(self, fileobject, length):
        self._file = fileobject
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.
        If you ask for too much or there isn't anything left,
        this will raise an InputStreamExhausted error.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        if num_bytes is None:
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)
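    # Behavior sketch (illustrative): LimitBytes(f, 4) caps reads at 4 bytes
    # in total; once self.remaining hits 0, the next read() raises
    # InputStreamExhausted, which ChunkIter translates into StopIteration.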

class InterBoundaryIter(object):
    """
    A Producer that will iterate over boundaries.
    """
    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def next(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()

class BoundaryIter(object):
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to .next() after locating the boundary will raise a
    StopIteration exception.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)
        try:
            from mx.TextTools import FS
            self._fs = FS(boundary).find
        except ImportError:
            self._fs = lambda data: data.find(boundary)

    def __iter__(self):
        return self

    def next(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = ''.join(chunks)
        boundary = self._find_boundary(chunk, len(chunk) < self._rollback)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:  # and len(chunk) >= (len(self._boundary) + 6):
                # There's nothing left, we should just return and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data, eof=False):
        """
        Finds a multipart boundary in data.

        Should no boundary exist in the data, None is returned. Otherwise
        a tuple containing the indices of the following are returned:

         * the end of current encapsulation
         * the start of the next encapsulation
        """
        index = self._fs(data)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            if data[max(0, end - 1)] == '\n':
                end -= 1
            if data[max(0, end - 1)] == '\r':
                end -= 1
            return end, next

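    # Worked example (illustrative): with boundary '--abc' and
    # data = 'payload\r\n--abc\r\n', _fs finds index 9, so next = 14 and
    # end backs up over the CRLF to 7; the caller then yields data[:7]
    # ('payload') and ungets everything from index 14 onward.
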
def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    Raise a MultiPartParserError if the argument is not a stream or an iterable.
    """
    iterator = None
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)

    if iterator is None:
        raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')

    # Iterate until the iterator is exhausted.
    for __ in iterator:
        pass

def parse_boundary_stream(stream, max_header_size):
    """
    Parses one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)
    # 'find' returns the top of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find('\r\n\r\n')

    def _parse_header(line):
        main_value_pair, params = parse_header(line)
        try:
            name, value = main_value_pair.split(':', 1)
        except ValueError:
            raise ValueError("Invalid header: %r" % line)
        return name, (value, params)

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4:])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split('\r\n'):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            name, (value, params) = _parse_header(line)
        except ValueError:
            continue

        if name == 'content-disposition':
            TYPE = FIELD
            if params.get('filename'):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)

class Parser(object):
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = '--' + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
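
# Illustrative shape of what Parser yields for a well-formed body: each item
# is an (item_type, meta_data, field_stream) tuple, along the lines of
#
#     (FIELD, {'content-disposition': ('form-data', {'name': 'title'})}, stream)
#     (FILE, {'content-disposition': ('form-data',
#             {'name': 'doc', 'filename': 'a.txt'})}, stream)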

def parse_header(line):
    """ Parse the header into a key-value. """
    plist = _parse_header_params(';' + line)
    key = plist.pop(0).lower()
    pdict = {}
    for p in plist:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i+1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
                value = value.replace('\\\\', '\\').replace('\\"', '"')
            pdict[name] = value
    return key, pdict
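
# Example (illustrative):
#
#     parse_header('form-data; name="upload"; filename="a.txt"')
#
# returns ('form-data', {'name': 'upload', 'filename': 'a.txt'}).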

def _parse_header_params(s):
    plist = []
    while s[:1] == ';':
        s = s[1:]
        end = s.find(';')
        while end > 0 and s.count('"', 0, end) % 2:
            end = s.find(';', end + 1)
        if end < 0:
            end = len(s)
        f = s[:end]
        plist.append(f.strip())
        s = s[end:]
    return plist
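
# Example (illustrative): the inner while loop skips semicolons inside quoted
# values, so
#
#     _parse_header_params(';a; b="x;y"; c=1')
#
# returns ['a', 'b="x;y"', 'c=1'] instead of splitting on the quoted ';'.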