| """ |
| Multi-part parsing for file uploads. |
| |
| Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to |
| file upload handlers for processing. |
| """ |
| from __future__ import unicode_literals |
| |
| import base64 |
| import binascii |
| import cgi |
| |
| from django.conf import settings |
| from django.core.exceptions import SuspiciousOperation |
| from django.utils.datastructures import MultiValueDict |
| from django.utils.encoding import force_text |
| from django.utils import six |
| from django.utils.text import unescape_entities |
| from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers |
| |
| __all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted') |
| |
| class MultiPartParserError(Exception): |
| pass |
| |
| class InputStreamExhausted(Exception): |
| """ |
| No more reads are allowed from this device. |
| """ |
| pass |
| |
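| # Part types yielded by the parser for each piece of the multipart body |
| # (see ``parse_boundary_stream``). |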
| RAW = "raw" |
| FILE = "file" |
| FIELD = "field" |
| |
| class MultiPartParser(object): |
| """ |
| An RFC 2388 multipart/form-data parser. |
| |
| ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks |
| and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``. |
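| |
| A minimal usage sketch (assuming, for illustration, a Django ``request`` |
| with a multipart body and its own upload handlers):: |
| |
|     parser = MultiPartParser(request.META, request, request.upload_handlers) |
|     post, files = parser.parse() |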
| """ |
| def __init__(self, META, input_data, upload_handlers, encoding=None): |
| """ |
| Initialize the MultiPartParser object. |
| |
| :META: |
| The standard ``META`` dictionary in Django request objects. |
| :input_data: |
| The raw post data, as a file-like object. |
| :upload_handlers: |
| A list of UploadHandler instances that perform operations on the |
| uploaded data. |
| :encoding: |
| The encoding with which to treat the incoming data. |
| """ |
| |
| # |
| # Content-Type should contain multipart and the boundary information. |
| # |
| |
| content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', '')) |
| if not content_type.startswith('multipart/'): |
| raise MultiPartParserError('Invalid Content-Type: %s' % content_type) |
| |
| # Parse the header to get the boundary to split the parts. |
| ctypes, opts = parse_header(content_type.encode('ascii')) |
| boundary = opts.get('boundary') |
| if not boundary or not cgi.valid_boundary(boundary): |
| raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary) |
| |
| # Content-Length should contain the length of the body we are about |
| # to receive. |
| try: |
| content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH', 0))) |
| except (ValueError, TypeError): |
| content_length = 0 |
| |
| if content_length < 0: |
| # A negative Content-Length is invalid; refuse to continue. |
| raise MultiPartParserError("Invalid content length: %r" % content_length) |
| |
| if isinstance(boundary, six.text_type): |
| boundary = boundary.encode('ascii') |
| self._boundary = boundary |
| self._input_data = input_data |
| |
| # For compatibility with low-level network APIs (with 32-bit integers), |
| # the chunk size should be < 2^31, but still divisible by 4. |
| possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size] |
| self._chunk_size = min([2**31-4] + possible_sizes) |
| |
| self._meta = META |
| self._encoding = encoding or settings.DEFAULT_CHARSET |
| self._content_length = content_length |
| self._upload_handlers = upload_handlers |
| |
| def parse(self): |
| """ |
| Parse the POST data and break it into a FILES MultiValueDict and a POST |
| MultiValueDict. |
| |
| Returns a tuple containing the POST and FILES dictionary, respectively. |
| """ |
| # We have to import QueryDict down here to avoid a circular import. |
| from django.http import QueryDict |
| |
| encoding = self._encoding |
| handlers = self._upload_handlers |
| |
| # The HTTP spec says that Content-Length >= 0 is valid; handle the |
| # Content-Length == 0 case before doing any real work. |
| if self._content_length == 0: |
| return QueryDict('', encoding=self._encoding), MultiValueDict() |
| |
| # See if the handler will want to take care of the parsing. |
| # This allows overriding everything if somebody wants it. |
| for handler in handlers: |
| result = handler.handle_raw_input(self._input_data, |
| self._meta, |
| self._content_length, |
| self._boundary, |
| encoding) |
| if result is not None: |
| return result[0], result[1] |
| |
| # Create the data structures to be used later. |
| self._post = QueryDict('', mutable=True) |
| self._files = MultiValueDict() |
| |
| # Instantiate the parser and stream: |
| stream = LazyStream(ChunkIter(self._input_data, self._chunk_size)) |
| |
| # A non-None old_field_name signals that a file-completion must be |
| # handled at the beginning of the next loop iteration. |
| old_field_name = None |
| counters = [0] * len(handlers) |
| |
| try: |
| for item_type, meta_data, field_stream in Parser(stream, self._boundary): |
| if old_field_name: |
| # We run this at the beginning of the next loop |
| # since we cannot be sure a file is complete until |
| # we hit the next boundary/part of the multipart content. |
| self.handle_file_complete(old_field_name, counters) |
| old_field_name = None |
| |
| try: |
| disposition = meta_data['content-disposition'][1] |
| field_name = disposition['name'].strip() |
| except (KeyError, IndexError, AttributeError): |
| continue |
| |
| transfer_encoding = meta_data.get('content-transfer-encoding') |
| if transfer_encoding is not None: |
| transfer_encoding = transfer_encoding[0].strip() |
| field_name = force_text(field_name, encoding, errors='replace') |
| |
| if item_type == FIELD: |
| # This is a post field, we can just set it in the post |
| if transfer_encoding == 'base64': |
| raw_data = field_stream.read() |
| try: |
| data = base64.b64decode(raw_data) |
| except (TypeError, binascii.Error): |
| # Not valid base64; fall back to the raw bytes. |
| data = raw_data |
| else: |
| data = field_stream.read() |
| |
| self._post.appendlist(field_name, |
| force_text(data, encoding, errors='replace')) |
| elif item_type == FILE: |
| # This is a file, use the handler... |
| file_name = disposition.get('filename') |
| if not file_name: |
| continue |
| file_name = force_text(file_name, encoding, errors='replace') |
| file_name = self.IE_sanitize(unescape_entities(file_name)) |
| |
| content_type = meta_data.get('content-type', ('',))[0].strip() |
| try: |
| charset = meta_data.get('content-type', (0, {}))[1].get('charset', None) |
| except (IndexError, AttributeError): |
| charset = None |
| |
| try: |
| content_length = int(meta_data.get('content-length')[0]) |
| except (IndexError, TypeError, ValueError): |
| content_length = None |
| |
| counters = [0] * len(handlers) |
| try: |
| for handler in handlers: |
| try: |
| handler.new_file(field_name, file_name, |
| content_type, content_length, |
| charset) |
| except StopFutureHandlers: |
| break |
| |
| for chunk in field_stream: |
| if transfer_encoding == 'base64': |
| # We only special-case base64 transfer encoding |
| # We should always read base64 streams in multiples of 4 |
| over_bytes = len(chunk) % 4 |
| if over_bytes: |
| over_chunk = field_stream.read(4 - over_bytes) |
| chunk += over_chunk |
| |
| try: |
| chunk = base64.b64decode(chunk) |
| except Exception as e: |
| # Since this is only a chunk, any error is an unfixable error. |
| raise MultiPartParserError("Could not decode base64 data: %r" % e) |
| |
| for i, handler in enumerate(handlers): |
| chunk_length = len(chunk) |
| chunk = handler.receive_data_chunk(chunk, |
| counters[i]) |
| counters[i] += chunk_length |
| if chunk is None: |
| # If the chunk received by the handler is None, then don't continue. |
| break |
| |
| except SkipFile: |
| # Just use up the rest of this file... |
| exhaust(field_stream) |
| else: |
| # Handle file upload completions on next iteration. |
| old_field_name = field_name |
| else: |
| # If this is neither a FIELD nor a FILE, just exhaust the stream. |
| exhaust(stream) |
| except StopUpload as e: |
| if not e.connection_reset: |
| exhaust(self._input_data) |
| else: |
| # Make sure that the request data is all fed |
| exhaust(self._input_data) |
| |
| # Signal that the upload has completed. |
| for handler in handlers: |
| retval = handler.upload_complete() |
| if retval: |
| break |
| |
| return self._post, self._files |
| |
| def handle_file_complete(self, old_field_name, counters): |
| """ |
| Handle all the signalling that takes place when a file is complete. |
| """ |
| for i, handler in enumerate(self._upload_handlers): |
| file_obj = handler.file_complete(counters[i]) |
| if file_obj: |
| # If it returns a file object, then set the files dict. |
| self._files.appendlist(force_text(old_field_name, |
| self._encoding, |
| errors='replace'), |
| file_obj) |
| break |
| |
| def IE_sanitize(self, filename): |
| """Cleanup filename from Internet Explorer full paths.""" |
| return filename and filename[filename.rfind("\\")+1:].strip() |
| |
| class LazyStream(six.Iterator): |
| """ |
| The LazyStream wrapper allows one to get and "unget" bytes from a stream. |
| |
| Given a producer object (an iterator that yields bytestrings), the |
| LazyStream object will support iteration, reading, and keeping a "look-back" |
| variable in case you need to "unget" some bytes. |
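| |
| A sketch of the read/unget contract:: |
| |
|     stream = LazyStream(iter([b'abcdef'])) |
|     stream.read(3)        # returns b'abc' |
|     stream.unget(b'abc')  # push the bytes back |
|     stream.read()         # returns b'abcdef' again |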
| """ |
| def __init__(self, producer, length=None): |
| """ |
| Every LazyStream must have a producer when instantiated. |
| |
| A producer is an iterable that returns a string each time it |
| is called. |
| """ |
| self._producer = producer |
| self._empty = False |
| self._leftover = b'' |
| self.length = length |
| self.position = 0 |
| self._remaining = length |
| self._unget_history = [] |
| |
| def tell(self): |
| return self.position |
| |
| def read(self, size=None): |
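| """ |
| Read and return up to ``size`` bytes; if ``size`` is None, read to the |
| end of the stream (or up to ``length`` bytes when a length was given). |
| """ |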
| def parts(): |
| remaining = self._remaining if size is None else size |
| # do the whole thing in one shot if no limit was provided. |
| if remaining is None: |
| yield b''.join(self) |
| return |
| |
| # otherwise do some bookkeeping to return exactly enough |
| # of the stream, stashing any extra content we get from |
| # the producer |
| while remaining != 0: |
| assert remaining > 0, 'remaining bytes to read should never go negative' |
| |
| chunk = next(self) |
| |
| emitting = chunk[:remaining] |
| self.unget(chunk[remaining:]) |
| remaining -= len(emitting) |
| yield emitting |
| |
| out = b''.join(parts()) |
| return out |
| |
| def __next__(self): |
| """ |
| Used when the exact number of bytes to read is unimportant. |
| |
| This procedure just returns whatever chunk is conveniently returned |
| from the iterator. Useful to avoid unnecessary bookkeeping if |
| performance is an issue. |
| """ |
| if self._leftover: |
| output = self._leftover |
| self._leftover = b'' |
| else: |
| output = next(self._producer) |
| self._unget_history = [] |
| self.position += len(output) |
| return output |
| |
| def close(self): |
| """ |
| Used to invalidate/disable this lazy stream. |
| |
| Replaces the producer with an empty list. Any leftover bytes that have |
| already been read will still be reported upon read() and/or next(). |
| """ |
| self._producer = [] |
| |
| def __iter__(self): |
| return self |
| |
| def unget(self, data): |
| """ |
| Places bytes back onto the front of the lazy stream. |
| |
| Future calls to read() will return those bytes first. The |
| stream position and thus tell() will be rewound. |
| """ |
| if not data: |
| return |
| self._update_unget_history(len(data)) |
| self.position -= len(data) |
| self._leftover = b''.join([data, self._leftover]) |
| |
| def _update_unget_history(self, num_bytes): |
| """ |
| Updates the unget history as a sanity check to see if we've pushed |
| back the same number of bytes in one chunk. If we keep ungetting the |
| same number of bytes many times (here, 50), we're most likely in an |
| infinite loop of some sort. This is usually caused by a |
| maliciously-malformed MIME request. |
| """ |
| self._unget_history = [num_bytes] + self._unget_history[:49] |
| number_equal = len([current_number for current_number in self._unget_history |
| if current_number == num_bytes]) |
| |
| if number_equal > 40: |
| raise SuspiciousOperation( |
| "The multipart parser got stuck, which shouldn't happen with" |
| " normal uploaded files. Check for malicious upload activity;" |
| " if there is none, report this to the Django developers." |
| ) |
| |
| class ChunkIter(six.Iterator): |
| """ |
| An iterable that will yield chunks of data. Given a file-like object as |
| its constructor argument, this object will yield chunks read from that |
| object. |
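| |
| For example:: |
| |
|     from io import BytesIO |
|     list(ChunkIter(BytesIO(b'123456789'), 4))  # [b'1234', b'5678', b'9'] |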
| """ |
| def __init__(self, flo, chunk_size=64 * 1024): |
| self.flo = flo |
| self.chunk_size = chunk_size |
| |
| def __next__(self): |
| try: |
| data = self.flo.read(self.chunk_size) |
| except InputStreamExhausted: |
| raise StopIteration() |
| if data: |
| return data |
| else: |
| raise StopIteration() |
| |
| def __iter__(self): |
| return self |
| |
| class InterBoundaryIter(six.Iterator): |
| """ |
| A Producer that will iterate over boundaries. |
| """ |
| def __init__(self, stream, boundary): |
| self._stream = stream |
| self._boundary = boundary |
| |
| def __iter__(self): |
| return self |
| |
| def __next__(self): |
| try: |
| return LazyStream(BoundaryIter(self._stream, self._boundary)) |
| except InputStreamExhausted: |
| raise StopIteration() |
| |
| class BoundaryIter(six.Iterator): |
| """ |
| A Producer that is sensitive to boundaries. |
| |
| Will happily yield bytes until a boundary is found. Will yield the bytes |
| before the boundary, throw away the boundary bytes themselves, and push the |
| post-boundary bytes back on the stream. |
| |
| Further calls to next() after the boundary has been located will |
| raise StopIteration. |
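| |
| A sketch of the contract (the CRLF preceding the boundary is treated |
| as part of the delimiter and discarded with it):: |
| |
|     stream = LazyStream(iter([b'payload\\r\\n--abc\\r\\nrest'])) |
|     b''.join(BoundaryIter(stream, b'--abc'))  # b'payload' |
|     stream.read()  # b'\\r\\nrest' was pushed back onto the stream |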
| """ |
| |
| def __init__(self, stream, boundary): |
| self._stream = stream |
| self._boundary = boundary |
| self._done = False |
| # rollback an additional six bytes because the format is like |
| # this: CRLF<boundary>[--CRLF] |
| self._rollback = len(boundary) + 6 |
| |
| # Probe the stream: if not even one byte can be read, the stream is |
| # already exhausted and there is no part to parse. |
| unused_char = self._stream.read(1) |
| if not unused_char: |
| raise InputStreamExhausted() |
| self._stream.unget(unused_char) |
| |
| # Try to use the mx fast string search if available. Otherwise |
| # use Python's find. Wrap the latter for consistency. |
| try: |
| from mx.TextTools import FS |
| self._fs = FS(boundary).find |
| except ImportError: |
| self._fs = lambda data: data.find(boundary) |
| |
| def __iter__(self): |
| return self |
| |
| def __next__(self): |
| if self._done: |
| raise StopIteration() |
| |
| stream = self._stream |
| rollback = self._rollback |
| |
| bytes_read = 0 |
| chunks = [] |
| for data in stream: |
| bytes_read += len(data) |
| chunks.append(data) |
| if bytes_read > rollback: |
| break |
| if not data: |
| break |
| else: |
| self._done = True |
| |
| if not chunks: |
| raise StopIteration() |
| |
| chunk = b''.join(chunks) |
| boundary = self._find_boundary(chunk, len(chunk) < self._rollback) |
| |
| if boundary: |
| end, next = boundary |
| stream.unget(chunk[next:]) |
| self._done = True |
| return chunk[:end] |
| else: |
| # make sure we don't treat a partial boundary (and |
| # its separators) as data |
| if not chunk[:-rollback]: |
| # There's nothing left, we should just return and mark as done. |
| self._done = True |
| return chunk |
| else: |
| stream.unget(chunk[-rollback:]) |
| return chunk[:-rollback] |
| |
| def _find_boundary(self, data, eof=False): |
| """ |
| Finds a multipart boundary in data. |
| |
| Should no boundary exist in the data, None is returned. Otherwise, |
| a tuple is returned containing the indices of the following: |
| |
| * the end of current encapsulation |
| * the start of the next encapsulation |
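| |
| For example, with ``self._boundary = b'--abc'``, calling |
| ``self._find_boundary(b'data\\r\\n--abc--')`` returns ``(4, 11)``: the |
| CRLF before the boundary is excluded from the current encapsulation. |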
| """ |
| index = self._fs(data) |
| if index < 0: |
| return None |
| else: |
| end = index |
| next = index + len(self._boundary) |
| # back up over the CRLF that precedes the boundary |
| last = max(0, end-1) |
| if data[last:last+1] == b'\n': |
| end -= 1 |
| last = max(0, end-1) |
| if data[last:last+1] == b'\r': |
| end -= 1 |
| return end, next |
| |
| def exhaust(stream_or_iterable): |
| """ |
| Completely exhausts an iterator or stream. |
| |
| Raise a MultiPartParserError if the argument is not a stream or an iterable. |
| """ |
| iterator = None |
| try: |
| iterator = iter(stream_or_iterable) |
| except TypeError: |
| iterator = ChunkIter(stream_or_iterable, 16384) |
| |
| if iterator is None: |
| raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter') |
| |
| for __ in iterator: |
| pass |
| |
| def parse_boundary_stream(stream, max_header_size): |
| """ |
| Parses one and exactly one stream that encapsulates a boundary. |
| """ |
| # Stream at beginning of header, look for end of header |
| # and parse it if found. The header must fit within one |
| # chunk. |
| chunk = stream.read(max_header_size) |
| |
| # 'find' returns the start of these four bytes, so we'll |
| # need to munch them later to prevent them from polluting |
| # the payload. |
| header_end = chunk.find(b'\r\n\r\n') |
| |
| def _parse_header(line): |
| main_value_pair, params = parse_header(line) |
| try: |
| name, value = main_value_pair.split(':', 1) |
| except ValueError: |
| raise ValueError("Invalid header: %r" % line) |
| return name, (value, params) |
| |
| if header_end == -1: |
| # we found no header, so we just mark this fact and pass on |
| # the stream verbatim |
| stream.unget(chunk) |
| return (RAW, {}, stream) |
| |
| header = chunk[:header_end] |
| |
| # here we place any excess chunk back onto the stream, as |
| # well as throwing away the CRLFCRLF bytes from above. |
| stream.unget(chunk[header_end + 4:]) |
| |
| TYPE = RAW |
| outdict = {} |
| |
| # Parse each header line; unparseable (e.g. blank) lines are skipped. |
| for line in header.split(b'\r\n'): |
| # This terminology ("main value" and "dictionary of |
| # parameters") is from the Python docs. |
| try: |
| name, (value, params) = _parse_header(line) |
| except ValueError: |
| continue |
| |
| if name == 'content-disposition': |
| TYPE = FIELD |
| if params.get('filename'): |
| TYPE = FILE |
| |
| outdict[name] = value, params |
| |
| if TYPE == RAW: |
| stream.unget(chunk) |
| |
| return (TYPE, outdict, stream) |
| |
| class Parser(object): |
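| """ |
| Splits a multipart stream on its boundary and yields a |
| ``(item_type, meta_data, field_stream)`` tuple for each part, where |
| ``item_type`` is one of RAW, FILE or FIELD. |
| """ |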
| def __init__(self, stream, boundary): |
| self._stream = stream |
| self._separator = b'--' + boundary |
| |
| def __iter__(self): |
| boundarystream = InterBoundaryIter(self._stream, self._separator) |
| for sub_stream in boundarystream: |
| # Iterate over each part |
| yield parse_boundary_stream(sub_stream, 1024) |
| |
| def parse_header(line): |
| """ Parse the header into a key-value. |
| Input (line): bytes, output: unicode for key/name, bytes for value which |
| will be decoded later |
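| |
| For example, ``parse_header(b'multipart/form-data; boundary=abc')`` |
| returns ``('multipart/form-data', {'boundary': b'abc'})``. |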
| """ |
| plist = _parse_header_params(b';' + line) |
| key = plist.pop(0).lower().decode('ascii') |
| pdict = {} |
| for p in plist: |
| i = p.find(b'=') |
| if i >= 0: |
| name = p[:i].strip().lower().decode('ascii') |
| value = p[i+1:].strip() |
| if len(value) >= 2 and value[:1] == value[-1:] == b'"': |
| value = value[1:-1] |
| value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"') |
| pdict[name] = value |
| return key, pdict |
| |
| def _parse_header_params(s): |
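| """ |
| Split a b';'-separated parameter string into a list of raw parameter |
| bytestrings, ignoring semicolons that fall inside double-quoted values. |
| """ |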
| plist = [] |
| while s[:1] == b';': |
| s = s[1:] |
| end = s.find(b';') |
| while end > 0 and s.count(b'"', 0, end) % 2: |
| end = s.find(b';', end + 1) |
| if end < 0: |
| end = len(s) |
| f = s[:end] |
| plist.append(f.strip()) |
| s = s[end:] |
| return plist |