diff options
Diffstat (limited to 'cherrypy/lib/encoding.py')
-rw-r--r-- | cherrypy/lib/encoding.py | 320 |
1 files changed, 158 insertions, 162 deletions
diff --git a/cherrypy/lib/encoding.py b/cherrypy/lib/encoding.py index 9d4db1bb..9ab1d2f0 100644 --- a/cherrypy/lib/encoding.py +++ b/cherrypy/lib/encoding.py @@ -1,174 +1,164 @@ import struct import time +import types import cherrypy +from cherrypy.lib import file_generator from cherrypy.lib import set_vary_header -def decode(encoding=None, default_encoding='utf-8'): - """Decode cherrypy.request.params from str to unicode objects.""" - if not encoding: - ct = cherrypy.request.headers.elements("Content-Type") - if ct: - ct = ct[0] - encoding = ct.params.get("charset", None) - if (not encoding) and ct.value.lower().startswith("text/"): - # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1 - # When no explicit charset parameter is provided by the - # sender, media subtypes of the "text" type are defined - # to have a default charset value of "ISO-8859-1" when - # received via HTTP. - encoding = "ISO-8859-1" - - if not encoding: - encoding = default_encoding - - try: - decode_params(encoding) - except UnicodeDecodeError: - # IE and Firefox don't supply a charset when submitting form - # params with a CT of application/x-www-form-urlencoded. - # So after all our guessing, it could *still* be wrong. - # Start over with ISO-8859-1, since that seems to be preferred. - decode_params("ISO-8859-1") -def decode_params(encoding): - decoded_params = {} - for key, value in cherrypy.request.params.items(): - if not hasattr(value, 'file'): - # Skip the value if it is an uploaded file - if isinstance(value, list): - # value is a list: decode each element - value = [v.decode(encoding) if isinstance(v, str) else v for v in value] - elif isinstance(value, str): - # value is a regular string: decode it - value = value.decode(encoding) - decoded_params[key] = value +class ResponseEncoder: - # Decode all or nothing, so we can try again on error. - cherrypy.request.params = decoded_params - - -# Encoding - -def encode(encoding=None, errors='strict', text_only=True, add_charset=True): - # Guard against running twice - if getattr(cherrypy.request, "_encoding_attempted", False): - return - cherrypy.request._encoding_attempted = True + default_encoding = 'utf-8' + failmsg = "Response body could not be encoded with %r." + encoding = None + errors = 'strict' + text_only = True + add_charset = True - ct = cherrypy.response.headers.elements("Content-Type") - if ct: - ct = ct[0] - if (not text_only) or ct.value.lower().startswith("text/"): - # Set "charset=..." param on response Content-Type header - ct.params['charset'] = find_acceptable_charset(encoding, errors=errors) - if add_charset: - cherrypy.response.headers["Content-Type"] = str(ct) - -def encode_stream(encoding, errors='strict'): - """Encode a streaming response body. + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + self.attempted_charsets = set() + + if cherrypy.request.handler is not None: + # Replace request.handler with self + self.oldhandler = cherrypy.request.handler + cherrypy.request.handler = self - Use a generator wrapper, and just pray it works as the stream is - being written out. - """ - def encoder(body): - for chunk in body: - if isinstance(chunk, unicode): - chunk = chunk.encode(encoding, errors) - yield chunk - cherrypy.response.body = encoder(cherrypy.response.body) - return True - -def encode_string(encoding, errors='strict'): - """Encode a buffered response body.""" - try: - body = [] - for chunk in cherrypy.response.body: - if isinstance(chunk, unicode): - chunk = chunk.encode(encoding, errors) - body.append(chunk) - cherrypy.response.body = body - except (LookupError, UnicodeError): - return False - else: + def encode_stream(self, encoding): + """Encode a streaming response body. + + Use a generator wrapper, and just pray it works as the stream is + being written out. + """ + if encoding in self.attempted_charsets: + return False + self.attempted_charsets.add(encoding) + + def encoder(body): + for chunk in body: + if isinstance(chunk, unicode): + chunk = chunk.encode(encoding, self.errors) + yield chunk + self.body = encoder(self.body) return True - -def find_acceptable_charset(encoding=None, default_encoding='utf-8', errors='strict'): - response = cherrypy.response - if cherrypy.response.stream: - encoder = encode_stream - else: - response.collapse_body() - encoder = encode_string - if response.headers.has_key("Content-Length"): - # Delete Content-Length header so finalize() recalcs it. - # Encoded strings may be of different lengths from their - # unicode equivalents, and even from each other. For example: - # >>> t = u"\u7007\u3040" - # >>> len(t) - # 2 - # >>> len(t.encode("UTF-8")) - # 6 - # >>> len(t.encode("utf7")) - # 8 - del response.headers["Content-Length"] - - # Parse the Accept-Charset request header, and try to provide one - # of the requested charsets (in order of user preference). - encs = cherrypy.request.headers.elements('Accept-Charset') - charsets = [enc.value.lower() for enc in encs] - attempted_charsets = [] + def encode_string(self, encoding): + """Encode a buffered response body.""" + if encoding in self.attempted_charsets: + return False + self.attempted_charsets.add(encoding) + + try: + body = [] + for chunk in self.body: + if isinstance(chunk, unicode): + chunk = chunk.encode(encoding, self.errors) + body.append(chunk) + self.body = body + except (LookupError, UnicodeError): + return False + else: + return True - if encoding is not None: - # If specified, force this encoding to be used, or fail. - encoding = encoding.lower() - if (not charsets) or "*" in charsets or encoding in charsets: - if encoder(encoding, errors): - return encoding - else: - if not encs: - # Any character-set is acceptable. - if encoder(default_encoding, errors): - return default_encoding - else: - raise cherrypy.HTTPError(500, failmsg % default_encoding) + def find_acceptable_charset(self): + response = cherrypy.response + + if cherrypy.response.stream: + encoder = self.encode_stream else: - if "*" not in charsets: - # If no "*" is present in an Accept-Charset field, then all - # character sets not explicitly mentioned get a quality - # value of 0, except for ISO-8859-1, which gets a quality - # value of 1 if not explicitly mentioned. - iso = 'iso-8859-1' - if iso not in charsets: - attempted_charsets.append(iso) - if encoder(iso, errors): - return iso - - for element in encs: - if element.qvalue > 0: - if element.value == "*": - # Matches any charset. Try our default. - if default_encoding not in attempted_charsets: - attempted_charsets.append(default_encoding) - if encoder(default_encoding, errors): - return default_encoding - else: - encoding = element.value - if encoding not in attempted_charsets: - attempted_charsets.append(encoding) - if encoder(encoding, errors): + encoder = self.encode_string + if "Content-Length" in response.headers: + # Delete Content-Length header so finalize() recalcs it. + # Encoded strings may be of different lengths from their + # unicode equivalents, and even from each other. For example: + # >>> t = u"\u7007\u3040" + # >>> len(t) + # 2 + # >>> len(t.encode("UTF-8")) + # 6 + # >>> len(t.encode("utf7")) + # 8 + del response.headers["Content-Length"] + + # Parse the Accept-Charset request header, and try to provide one + # of the requested charsets (in order of user preference). + encs = cherrypy.request.headers.elements('Accept-Charset') + charsets = [enc.value.lower() for enc in encs] + + if self.encoding is not None: + # If specified, force this encoding to be used, or fail. + encoding = self.encoding.lower() + if (not charsets) or "*" in charsets or encoding in charsets: + if encoder(encoding): + return encoding + else: + if not encs: + # Any character-set is acceptable. + if encoder(self.default_encoding): + return self.default_encoding + else: + raise cherrypy.HTTPError(500, self.failmsg % self.default_encoding) + else: + if "*" not in charsets: + # If no "*" is present in an Accept-Charset field, then all + # character sets not explicitly mentioned get a quality + # value of 0, except for ISO-8859-1, which gets a quality + # value of 1 if not explicitly mentioned. + iso = 'iso-8859-1' + if iso not in charsets: + if encoder(iso): + return iso + + for element in encs: + if element.qvalue > 0: + if element.value == "*": + # Matches any charset. Try our default. + if encoder(self.default_encoding): + return self.default_encoding + else: + encoding = element.value + if encoder(encoding): return encoding + + # No suitable encoding found. + ac = cherrypy.request.headers.get('Accept-Charset') + if ac is None: + msg = "Your client did not send an Accept-Charset header." + else: + msg = "Your client sent this Accept-Charset header: %s." % ac + msg += " We tried these charsets: %s." % ", ".join(self.attempted_charsets) + raise cherrypy.HTTPError(406, msg) - # No suitable encoding found. - ac = cherrypy.request.headers.get('Accept-Charset') - if ac is None: - msg = "Your client did not send an Accept-Charset header." - else: - msg = "Your client sent this Accept-Charset header: %s." % ac - msg += " We tried these charsets: %s." % ", ".join(attempted_charsets) - raise cherrypy.HTTPError(406, msg) - + def __call__(self, *args, **kwargs): + self.body = self.oldhandler(*args, **kwargs) + + if isinstance(self.body, basestring): + # strings get wrapped in a list because iterating over a single + # item list is much faster than iterating over every character + # in a long string. + if self.body: + self.body = [self.body] + else: + # [''] doesn't evaluate to False, so replace it with []. + self.body = [] + elif isinstance(self.body, types.FileType): + self.body = file_generator(self.body) + elif self.body is None: + self.body = [] + + ct = cherrypy.response.headers.elements("Content-Type") + if ct: + ct = ct[0] + if (not self.text_only) or ct.value.lower().startswith("text/"): + # Set "charset=..." param on response Content-Type header + ct.params['charset'] = self.find_acceptable_charset() + if self.add_charset: + cherrypy.response.headers["Content-Type"] = str(ct) + + return self.body # GZIP @@ -176,12 +166,14 @@ def compress(body, compress_level): """Compress 'body' at the given compress_level.""" import zlib - yield '\037\213' # magic header - yield '\010' # compression method - yield '\0' - yield struct.pack("<L", long(time.time())) - yield '\002' - yield '\377' + # See http://www.gzip.org/zlib/rfc-gzip.html + yield '\x1f\x8b' # ID1 and ID2: gzip marker + yield '\x08' # CM: compression method + yield '\x00' # FLG: none set + # MTIME: 4 bytes + yield struct.pack("<L", int(time.time()) & 0xFFFFFFFFL) + yield '\x02' # XFL: max compression, slowest algo + yield '\xff' # OS: unknown crc = zlib.crc32("") size = 0 @@ -193,9 +185,13 @@ def compress(body, compress_level): crc = zlib.crc32(line, crc) yield zobj.compress(line) yield zobj.flush() - yield struct.pack("<l", crc) + + # CRC32: 4 bytes + yield struct.pack("<L", crc & 0xFFFFFFFFL) + # ISIZE: 4 bytes yield struct.pack("<L", size & 0xFFFFFFFFL) + def decompress(body): import gzip, StringIO |