diff options
author | Marc Abramowitz <marc@marc-abramowitz.com> | 2015-04-30 17:39:24 -0700 |
---|---|---|
committer | Marc Abramowitz <marc@marc-abramowitz.com> | 2015-04-30 17:39:24 -0700 |
commit | fa100c92c06d3a8a61a0dda1a2e06018437b09c6 (patch) | |
tree | a1cc50f93fbf257685c3849e03496c5e33949281 /tests/test_wsgiwrappers.py | |
download | paste-git-fa100c92c06d3a8a61a0dda1a2e06018437b09c6.tar.gz |
test_wsgirequest_charset: Use UTF-8 instead of iso-8859-1 [test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1]
because it seems that the de facto standard for encoding URIs is to use UTF-8.
I've been reading about url encoding and it seems like perhaps using an
encoding other than UTF-8 is very non-standard and not well-supported (this
test is trying to use `iso-8859-1`).
From http://en.wikipedia.org/wiki/Percent-encoding:
> For a non-ASCII character, it is typically converted to its byte sequence in
> UTF-8, and then each byte value is represented as above.
> The generic URI syntax mandates that new URI schemes that provide for the
> representation of character data in a URI must, in effect, represent
> characters from the unreserved set without translation, and should convert
> all other characters to bytes according to UTF-8, and then percent-encode
> those values. This requirement was introduced in January 2005 with the
> publication of RFC 3986
From http://tools.ietf.org/html/rfc3986:
> Non-ASCII characters must first be encoded according to UTF-8 [STD63], and
> then each octet of the corresponding UTF-8 sequence must be percent-encoded
> to be represented as URI characters. URI producing applications must not use
> percent-encoding in host unless it is used to represent a UTF-8 character
> sequence.
From http://tools.ietf.org/html/rfc3987:
> Conversions from URIs to IRIs MUST NOT use any character encoding other than
> UTF-8 in steps 3 and 4, even if it might be possible to guess from the
> context that another character encoding than UTF-8 was used in the URI. For
> example, the URI "http://www.example.org/r%E9sum%E9.html" might with some
> guessing be interpreted to contain two e-acute characters encoded as
> iso-8859-1. It must not be converted to an IRI containing these e-acute
> characters. Otherwise, in the future the IRI will be mapped to
> "http://www.example.org/r%C3%A9sum%C3%A9.html", which is a different URI from
> "http://www.example.org/r%E9sum%E9.html".
See issue #7, which I think this at least partially fixes.
Diffstat (limited to 'tests/test_wsgiwrappers.py')
-rw-r--r-- | tests/test_wsgiwrappers.py | 146 |
1 files changed, 146 insertions, 0 deletions
# -*- coding: utf-8 -*-
# (c) 2007 Philip Jenvey; written for Paste (http://pythonpaste.org)
# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
# File: tests/test_wsgiwrappers.py (new file, mode 100644, blob 8719693)
import cgi

import six

from paste.fixture import TestApp
from paste.wsgiwrappers import WSGIRequest, WSGIResponse


class AssertApp(object):
    """Minimal WSGI application that runs an assertion callback per request.

    ``assertfunc`` is called with the WSGI ``environ`` once the response has
    been started. Nothing is caught here, so a failing assertion propagates
    to whoever invoked the application.
    """

    def __init__(self, assertfunc):
        self.assertfunc = assertfunc

    def __call__(self, environ, start_response):
        start_response('200 OK', [('Content-type', 'text/plain')])
        self.assertfunc(environ)
        # PEP 3333 requires the body iterable to yield bytestrings. A bytes
        # literal is the same object type as ``str`` on Python 2, so this is
        # backward-compatible there.
        return [b'Passed']


# Sentinel meaning "no default charset override requested".
# NOTE(review): presumably a sentinel is used instead of None so that None
# itself can be passed as a charset -- confirm against WSGIRequest.defaults.
no_encoding = object()


def valid_name(name, encoding=no_encoding, post=False):
    """Build an ``environ`` checker for the ``name`` request parameter.

    :param name: value the ``name`` parameter is expected to equal.
    :param encoding: when supplied, ``dict(content_type='text/html',
        charset=encoding)`` is pushed onto ``WSGIRequest.defaults`` for the
        duration of the check and popped afterwards.
    :param post: look the parameter up in ``request.POST`` rather than
        ``request.GET``.
    :returns: a callable accepting the WSGI ``environ`` that asserts both the
        selected parameter dict and ``request.params`` hold ``name``.
    """
    override_defaults = encoding is not no_encoding

    def assert_valid_name(environ):
        if override_defaults:
            WSGIRequest.defaults._push_object(dict(content_type='text/html',
                                                   charset=encoding))
        try:
            request = WSGIRequest(environ)
            params = request.POST if post else request.GET
            assert params['name'] == name
            assert request.params['name'] == name
        finally:
            # Always restore the defaults stack, even when an assertion fails.
            if override_defaults:
                WSGIRequest.defaults._pop_object()
    return assert_valid_name


def test_wsgirequest_charset():
    """Request parameters are decoded according to the configured charset."""
    # Jose, 'José'
    app = TestApp(AssertApp(assertfunc=valid_name(u'José', encoding='UTF-8')))
    app.get('/?name=Jos%C3%A9')

    # Tanaka, '田中'
    app = TestApp(AssertApp(assertfunc=valid_name(u'田中', encoding='UTF-8')))
    app.get('/?name=%E7%94%B0%E4%B8%AD')

    # Nippon (Japan), '日本'
    app = TestApp(AssertApp(assertfunc=valid_name(u'日本', encoding='UTF-8',
                                                  post=True)))
    app.post('/', params=dict(name='日本'))

    # WSGIRequest will determine the charset from the Content-Type header when
    # unicode is expected.
    # No encoding specified: not expecting unicode
    app = TestApp(AssertApp(assertfunc=valid_name('日本', post=True)))
    content_type = 'application/x-www-form-urlencoded; charset=%s'
    app.post('/', params=dict(name='日本'),
             headers={'content-type': content_type % 'UTF-8'})

    # Encoding specified: expect unicode. Shiftjis is the default encoding, but
    # params become UTF-8 because the browser specified so
    app = TestApp(AssertApp(assertfunc=valid_name(u'日本', post=True,
                                                  encoding='shiftjis')))
    app.post('/', params=dict(name='日本'),
             headers={'content-type': content_type % 'UTF-8'})

    # Browser did not specify: parse params as the fallback shiftjis
    app = TestApp(AssertApp(assertfunc=valid_name(u'日本', post=True,
                                                  encoding='shiftjis')))
    app.post('/', params=dict(name=u'日本'.encode('shiftjis')))


def test_wsgirequest_charset_fileupload():
    """An uploaded file's name follows the request charset; its data does not."""
    def handle_fileupload(environ, start_response):
        start_response('200 OK', [('Content-type', 'text/plain')])
        request = WSGIRequest(environ)

        # No charset set on the request: native ``str`` filename.
        assert len(request.POST) == 1
        assert isinstance(request.POST.keys()[0], str)
        fs = request.POST['thefile']
        assert isinstance(fs, cgi.FieldStorage)
        assert isinstance(fs.filename, str)
        assert fs.filename == '寿司.txt'
        assert fs.value == b'Sushi'

        # Charset set: the filename comes back as text, the payload is
        # still bytes.
        request.charset = 'UTF-8'
        assert len(request.POST) == 1
        assert isinstance(request.POST.keys()[0], str)
        fs = request.POST['thefile']
        assert isinstance(fs, cgi.FieldStorage)
        assert isinstance(fs.filename, six.text_type)
        assert fs.filename == u'寿司.txt'
        assert fs.value == b'Sushi'

        # Clearing the charset must not disturb the already-fetched field.
        request.charset = None
        assert fs.value == b'Sushi'
        return []

    app = TestApp(handle_fileupload)
    app.post('/', upload_files=[('thefile', '寿司.txt', b'Sushi')])


def test_wsgiresponse_charset():
    """Response content is encoded to bytes only when a charset is set."""
    # Charset taken from the mimetype argument.
    response = WSGIResponse(mimetype='text/html; charset=UTF-8')
    assert response.content_type == 'text/html'
    assert response.charset == 'UTF-8'
    response.write(u'test')
    response.write(u'test2')
    response.write('test3')
    status, headers, content = response.wsgi_response()
    for data in content:
        assert isinstance(data, six.binary_type)

    # Charset taken from the pushed defaults.
    WSGIResponse.defaults._push_object(dict(content_type='text/html',
                                            charset='iso-8859-1'))
    try:
        response = WSGIResponse()
        response.write(u'test')
        response.write(u'test2')
        response.write('test3')
        status, headers, content = response.wsgi_response()
        for data in content:
            assert isinstance(data, six.binary_type)
    finally:
        WSGIResponse.defaults._pop_object()

    # WSGIResponse will allow unicode to pass through when no charset is
    # set; both None and the empty string are exercised as "no charset".
    for no_charset in (None, ''):
        WSGIResponse.defaults._push_object(dict(content_type='text/html',
                                                charset=no_charset))
        try:
            response = WSGIResponse(u'test')
            response.write(u'test1')
            status, headers, content = response.wsgi_response()
            for data in content:
                assert isinstance(data, six.text_type)
        finally:
            WSGIResponse.defaults._pop_object()