summaryrefslogtreecommitdiff
path: root/cherrypy/lib/encoding.py
diff options
context:
space:
mode:
Diffstat (limited to 'cherrypy/lib/encoding.py')
-rw-r--r--cherrypy/lib/encoding.py320
1 files changed, 158 insertions, 162 deletions
diff --git a/cherrypy/lib/encoding.py b/cherrypy/lib/encoding.py
index 9d4db1bb..9ab1d2f0 100644
--- a/cherrypy/lib/encoding.py
+++ b/cherrypy/lib/encoding.py
@@ -1,174 +1,164 @@
import struct
import time
+import types
import cherrypy
+from cherrypy.lib import file_generator
from cherrypy.lib import set_vary_header
-def decode(encoding=None, default_encoding='utf-8'):
- """Decode cherrypy.request.params from str to unicode objects."""
- if not encoding:
- ct = cherrypy.request.headers.elements("Content-Type")
- if ct:
- ct = ct[0]
- encoding = ct.params.get("charset", None)
- if (not encoding) and ct.value.lower().startswith("text/"):
- # http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
- # When no explicit charset parameter is provided by the
- # sender, media subtypes of the "text" type are defined
- # to have a default charset value of "ISO-8859-1" when
- # received via HTTP.
- encoding = "ISO-8859-1"
-
- if not encoding:
- encoding = default_encoding
-
- try:
- decode_params(encoding)
- except UnicodeDecodeError:
- # IE and Firefox don't supply a charset when submitting form
- # params with a CT of application/x-www-form-urlencoded.
- # So after all our guessing, it could *still* be wrong.
- # Start over with ISO-8859-1, since that seems to be preferred.
- decode_params("ISO-8859-1")
-def decode_params(encoding):
- decoded_params = {}
- for key, value in cherrypy.request.params.items():
- if not hasattr(value, 'file'):
- # Skip the value if it is an uploaded file
- if isinstance(value, list):
- # value is a list: decode each element
- value = [v.decode(encoding) if isinstance(v, str) else v for v in value]
- elif isinstance(value, str):
- # value is a regular string: decode it
- value = value.decode(encoding)
- decoded_params[key] = value
+class ResponseEncoder:
- # Decode all or nothing, so we can try again on error.
- cherrypy.request.params = decoded_params
-
-
-# Encoding
-
-def encode(encoding=None, errors='strict', text_only=True, add_charset=True):
- # Guard against running twice
- if getattr(cherrypy.request, "_encoding_attempted", False):
- return
- cherrypy.request._encoding_attempted = True
+ default_encoding = 'utf-8'
+ failmsg = "Response body could not be encoded with %r."
+ encoding = None
+ errors = 'strict'
+ text_only = True
+ add_charset = True
- ct = cherrypy.response.headers.elements("Content-Type")
- if ct:
- ct = ct[0]
- if (not text_only) or ct.value.lower().startswith("text/"):
- # Set "charset=..." param on response Content-Type header
- ct.params['charset'] = find_acceptable_charset(encoding, errors=errors)
- if add_charset:
- cherrypy.response.headers["Content-Type"] = str(ct)
-
-def encode_stream(encoding, errors='strict'):
- """Encode a streaming response body.
+ def __init__(self, **kwargs):
+ for k, v in kwargs.items():
+ setattr(self, k, v)
+
+ self.attempted_charsets = set()
+
+ if cherrypy.request.handler is not None:
+ # Replace request.handler with self
+ self.oldhandler = cherrypy.request.handler
+ cherrypy.request.handler = self
- Use a generator wrapper, and just pray it works as the stream is
- being written out.
- """
- def encoder(body):
- for chunk in body:
- if isinstance(chunk, unicode):
- chunk = chunk.encode(encoding, errors)
- yield chunk
- cherrypy.response.body = encoder(cherrypy.response.body)
- return True
-
-def encode_string(encoding, errors='strict'):
- """Encode a buffered response body."""
- try:
- body = []
- for chunk in cherrypy.response.body:
- if isinstance(chunk, unicode):
- chunk = chunk.encode(encoding, errors)
- body.append(chunk)
- cherrypy.response.body = body
- except (LookupError, UnicodeError):
- return False
- else:
+ def encode_stream(self, encoding):
+ """Encode a streaming response body.
+
+ Use a generator wrapper, and just pray it works as the stream is
+ being written out.
+ """
+ if encoding in self.attempted_charsets:
+ return False
+ self.attempted_charsets.add(encoding)
+
+ def encoder(body):
+ for chunk in body:
+ if isinstance(chunk, unicode):
+ chunk = chunk.encode(encoding, self.errors)
+ yield chunk
+ self.body = encoder(self.body)
return True
-
-def find_acceptable_charset(encoding=None, default_encoding='utf-8', errors='strict'):
- response = cherrypy.response
- if cherrypy.response.stream:
- encoder = encode_stream
- else:
- response.collapse_body()
- encoder = encode_string
- if response.headers.has_key("Content-Length"):
- # Delete Content-Length header so finalize() recalcs it.
- # Encoded strings may be of different lengths from their
- # unicode equivalents, and even from each other. For example:
- # >>> t = u"\u7007\u3040"
- # >>> len(t)
- # 2
- # >>> len(t.encode("UTF-8"))
- # 6
- # >>> len(t.encode("utf7"))
- # 8
- del response.headers["Content-Length"]
-
- # Parse the Accept-Charset request header, and try to provide one
- # of the requested charsets (in order of user preference).
- encs = cherrypy.request.headers.elements('Accept-Charset')
- charsets = [enc.value.lower() for enc in encs]
- attempted_charsets = []
+ def encode_string(self, encoding):
+ """Encode a buffered response body."""
+ if encoding in self.attempted_charsets:
+ return False
+ self.attempted_charsets.add(encoding)
+
+ try:
+ body = []
+ for chunk in self.body:
+ if isinstance(chunk, unicode):
+ chunk = chunk.encode(encoding, self.errors)
+ body.append(chunk)
+ self.body = body
+ except (LookupError, UnicodeError):
+ return False
+ else:
+ return True
- if encoding is not None:
- # If specified, force this encoding to be used, or fail.
- encoding = encoding.lower()
- if (not charsets) or "*" in charsets or encoding in charsets:
- if encoder(encoding, errors):
- return encoding
- else:
- if not encs:
- # Any character-set is acceptable.
- if encoder(default_encoding, errors):
- return default_encoding
- else:
- raise cherrypy.HTTPError(500, failmsg % default_encoding)
+ def find_acceptable_charset(self):
+ response = cherrypy.response
+
+ if cherrypy.response.stream:
+ encoder = self.encode_stream
else:
- if "*" not in charsets:
- # If no "*" is present in an Accept-Charset field, then all
- # character sets not explicitly mentioned get a quality
- # value of 0, except for ISO-8859-1, which gets a quality
- # value of 1 if not explicitly mentioned.
- iso = 'iso-8859-1'
- if iso not in charsets:
- attempted_charsets.append(iso)
- if encoder(iso, errors):
- return iso
-
- for element in encs:
- if element.qvalue > 0:
- if element.value == "*":
- # Matches any charset. Try our default.
- if default_encoding not in attempted_charsets:
- attempted_charsets.append(default_encoding)
- if encoder(default_encoding, errors):
- return default_encoding
- else:
- encoding = element.value
- if encoding not in attempted_charsets:
- attempted_charsets.append(encoding)
- if encoder(encoding, errors):
+ encoder = self.encode_string
+ if "Content-Length" in response.headers:
+ # Delete Content-Length header so finalize() recalcs it.
+ # Encoded strings may be of different lengths from their
+ # unicode equivalents, and even from each other. For example:
+ # >>> t = u"\u7007\u3040"
+ # >>> len(t)
+ # 2
+ # >>> len(t.encode("UTF-8"))
+ # 6
+ # >>> len(t.encode("utf7"))
+ # 8
+ del response.headers["Content-Length"]
+
+ # Parse the Accept-Charset request header, and try to provide one
+ # of the requested charsets (in order of user preference).
+ encs = cherrypy.request.headers.elements('Accept-Charset')
+ charsets = [enc.value.lower() for enc in encs]
+
+ if self.encoding is not None:
+ # If specified, force this encoding to be used, or fail.
+ encoding = self.encoding.lower()
+ if (not charsets) or "*" in charsets or encoding in charsets:
+ if encoder(encoding):
+ return encoding
+ else:
+ if not encs:
+ # Any character-set is acceptable.
+ if encoder(self.default_encoding):
+ return self.default_encoding
+ else:
+ raise cherrypy.HTTPError(500, self.failmsg % self.default_encoding)
+ else:
+ if "*" not in charsets:
+ # If no "*" is present in an Accept-Charset field, then all
+ # character sets not explicitly mentioned get a quality
+ # value of 0, except for ISO-8859-1, which gets a quality
+ # value of 1 if not explicitly mentioned.
+ iso = 'iso-8859-1'
+ if iso not in charsets:
+ if encoder(iso):
+ return iso
+
+ for element in encs:
+ if element.qvalue > 0:
+ if element.value == "*":
+ # Matches any charset. Try our default.
+ if encoder(self.default_encoding):
+ return self.default_encoding
+ else:
+ encoding = element.value
+ if encoder(encoding):
return encoding
+
+ # No suitable encoding found.
+ ac = cherrypy.request.headers.get('Accept-Charset')
+ if ac is None:
+ msg = "Your client did not send an Accept-Charset header."
+ else:
+ msg = "Your client sent this Accept-Charset header: %s." % ac
+ msg += " We tried these charsets: %s." % ", ".join(self.attempted_charsets)
+ raise cherrypy.HTTPError(406, msg)
- # No suitable encoding found.
- ac = cherrypy.request.headers.get('Accept-Charset')
- if ac is None:
- msg = "Your client did not send an Accept-Charset header."
- else:
- msg = "Your client sent this Accept-Charset header: %s." % ac
- msg += " We tried these charsets: %s." % ", ".join(attempted_charsets)
- raise cherrypy.HTTPError(406, msg)
-
+ def __call__(self, *args, **kwargs):
+ self.body = self.oldhandler(*args, **kwargs)
+
+ if isinstance(self.body, basestring):
+ # strings get wrapped in a list because iterating over a single
+ # item list is much faster than iterating over every character
+ # in a long string.
+ if self.body:
+ self.body = [self.body]
+ else:
+ # [''] doesn't evaluate to False, so replace it with [].
+ self.body = []
+ elif isinstance(self.body, types.FileType):
+ self.body = file_generator(self.body)
+ elif self.body is None:
+ self.body = []
+
+ ct = cherrypy.response.headers.elements("Content-Type")
+ if ct:
+ ct = ct[0]
+ if (not self.text_only) or ct.value.lower().startswith("text/"):
+ # Set "charset=..." param on response Content-Type header
+ ct.params['charset'] = self.find_acceptable_charset()
+ if self.add_charset:
+ cherrypy.response.headers["Content-Type"] = str(ct)
+
+ return self.body
# GZIP
@@ -176,12 +166,14 @@ def compress(body, compress_level):
"""Compress 'body' at the given compress_level."""
import zlib
- yield '\037\213' # magic header
- yield '\010' # compression method
- yield '\0'
- yield struct.pack("<L", long(time.time()))
- yield '\002'
- yield '\377'
+ # See http://www.gzip.org/zlib/rfc-gzip.html
+ yield '\x1f\x8b' # ID1 and ID2: gzip marker
+ yield '\x08' # CM: compression method
+ yield '\x00' # FLG: none set
+ # MTIME: 4 bytes
+ yield struct.pack("<L", int(time.time()) & 0xFFFFFFFFL)
+ yield '\x02' # XFL: max compression, slowest algo
+ yield '\xff' # OS: unknown
crc = zlib.crc32("")
size = 0
@@ -193,9 +185,13 @@ def compress(body, compress_level):
crc = zlib.crc32(line, crc)
yield zobj.compress(line)
yield zobj.flush()
- yield struct.pack("<l", crc)
+
+ # CRC32: 4 bytes
+ yield struct.pack("<L", crc & 0xFFFFFFFFL)
+ # ISIZE: 4 bytes
yield struct.pack("<L", size & 0xFFFFFFFFL)
+
def decompress(body):
import gzip, StringIO