diff options
author | Marc Abramowitz <marc@marc-abramowitz.com> | 2015-04-30 17:39:24 -0700 |
---|---|---|
committer | Marc Abramowitz <marc@marc-abramowitz.com> | 2015-04-30 17:39:24 -0700 |
commit | fa100c92c06d3a8a61a0dda1a2e06018437b09c6 (patch) | |
tree | a1cc50f93fbf257685c3849e03496c5e33949281 /paste/util/quoting.py | |
download | paste-git-test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1.tar.gz |
test_wsgirequest_charset: Use UTF-8 instead of iso-8859-1 (branch: test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1)
because it seems that the de facto standard for encoding URIs is to use UTF-8.
I've been reading about url encoding and it seems like perhaps using an
encoding other than UTF-8 is very non-standard and not well-supported (this
test is trying to use `iso-8859-1`).
From http://en.wikipedia.org/wiki/Percent-encoding
> For a non-ASCII character, it is typically converted to its byte sequence in
> UTF-8, and then each byte value is represented as above.
> The generic URI syntax mandates that new URI schemes that provide for the
> representation of character data in a URI must, in effect, represent
> characters from the unreserved set without translation, and should convert
> all other characters to bytes according to UTF-8, and then percent-encode
> those values. This requirement was introduced in January 2005 with the
> publication of RFC 3986
From http://tools.ietf.org/html/rfc3986:
> Non-ASCII characters must first be encoded according to UTF-8 [STD63], and
> then each octet of the corresponding UTF-8 sequence must be percent-encoded
> to be represented as URI characters. URI producing applications must not use
> percent-encoding in host unless it is used to represent a UTF-8 character
> sequence.
From http://tools.ietf.org/html/rfc3987:
> Conversions from URIs to IRIs MUST NOT use any character encoding other than
> UTF-8 in steps 3 and 4, even if it might be possible to guess from the
> context that another character encoding than UTF-8 was used in the URI. For
> example, the URI "http://www.example.org/r%E9sum%E9.html" might with some
> guessing be interpreted to contain two e-acute characters encoded as
> iso-8859-1. It must not be converted to an IRI containing these e-acute
> characters. Otherwise, in the future the IRI will be mapped to
> "http://www.example.org/r%C3%A9sum%C3%A9.html", which is a different URI from
> "http://www.example.org/r%E9sum%E9.html".
See issue #7, which I think this at least partially fixes.
Diffstat (limited to 'paste/util/quoting.py')
-rw-r--r-- | paste/util/quoting.py | 85 |
1 files changed, 85 insertions, 0 deletions
# paste/util/quoting.py -- new file added by this commit
# (diff --git a/paste/util/quoting.py b/paste/util/quoting.py,
#  new file mode 100644, index 0000000..df0d9da, @@ -0,0 +1,85 @@)
#
# (c) 2005 Ian Bicking and contributors; written for Paste (http://pythonpaste.org)
# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
"""Helpers for quoting and unquoting text for HTML, HTML comments and URLs."""

import re

import six
from six.moves import html_entities
from six.moves.urllib.parse import quote, unquote


__all__ = ['html_quote', 'html_unquote', 'url_quote', 'url_unquote',
           'strip_html']

default_encoding = 'UTF-8'

# Replacement pairs applied in order; '&' must come first so the ampersands
# introduced by the later replacements are not escaped a second time.
# This is exactly the set ``cgi.escape(s, quote=True)`` used to handle;
# ``cgi.escape`` itself was removed in Python 3.8, so it is re-implemented here.
_TEXT_ESCAPES = (
    (u'&', u'&amp;'),
    (u'<', u'&lt;'),
    (u'>', u'&gt;'),
    (u'"', u'&quot;'),
)
_BYTE_ESCAPES = tuple(
    (raw.encode('ascii'), quoted.encode('ascii'))
    for raw, quoted in _TEXT_ESCAPES
)


def _escape(s):
    """Escape ``&``, ``<``, ``>`` and ``"`` in *s* (text or bytes).

    The result has the same type as *s*.  Byte strings are escaped at the
    byte level, which is correct for any ASCII-compatible encoding and avoids
    a decode/encode round trip.
    """
    pairs = _BYTE_ESCAPES if isinstance(s, six.binary_type) else _TEXT_ESCAPES
    for raw, quoted in pairs:
        s = s.replace(raw, quoted)
    return s


def html_quote(v, encoding=None):
    r"""
    Quote the value (turned to a string) as HTML.  This quotes <, >,
    &, and double quotes:

    >>> html_quote(1)
    '1'
    >>> html_quote('<hey!>')
    '&lt;hey!&gt;'

    ``None`` is rendered as the empty string.  Byte strings are returned as
    escaped byte strings.  Text (and any other object, converted with
    ``six.text_type``) is returned as native ``str``: on Python 2 that means
    it is encoded with *encoding* (default: ``default_encoding``).
    """
    encoding = encoding or default_encoding
    if v is None:
        return ''
    if isinstance(v, six.binary_type):
        return _escape(v)
    if not isinstance(v, six.text_type):
        v = six.text_type(v)
    if not six.PY3:
        # Python 2 callers expect a native (byte) str back.
        v = v.encode(encoding)
    return _escape(v)


# Named character references such as &amp; or &frac12; -- entity names may
# contain digits after the first letter.
_unquote_re = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*);')


def _entity_subber(match, name2c=html_entities.name2codepoint):
    """Regex callback: map a named entity to its character.

    Unknown entity names are left in the text untouched.
    """
    code = name2c.get(match.group(1))
    if code:
        return six.unichr(code)
    return match.group(0)


def html_unquote(s, encoding=None):
    r"""
    Decode the value: replace named HTML entities (``&lt;``, ``&nbsp;``,
    ...) with the characters they stand for.

    Byte strings are first decoded with *encoding* (default:
    ``default_encoding``); the result is always text.  Numeric character
    references and unknown entity names are left as they are.
    """
    if isinstance(s, six.binary_type):
        s = s.decode(encoding or default_encoding)
    return _unquote_re.sub(_entity_subber, s)


def strip_html(s):
    """
    Remove anything that looks like a tag from *s*, then decode the
    remaining HTML entities with ``html_unquote``.

    Byte strings are decoded with ``default_encoding`` first, so the text
    pattern below can be applied on Python 3 as well.
    """
    if isinstance(s, six.binary_type):
        s = s.decode(default_encoding)
    s = re.sub('<.*?>', '', s)
    s = html_unquote(s)
    return s


def no_quote(s):
    """
    Quoting that doesn't do anything
    """
    return s


_comment_quote_re = re.compile(r'\-\s*\>')
# Everything but \r, \n, \t:
# (currently unused -- comment_quote() does not strip control characters)
_bad_chars_re = re.compile('[\x00-\x08\x0b-\x0c\x0e-\x1f]')


def comment_quote(s):
    """
    Quote that makes sure text can't escape a comment

    Any ``-`` followed (optionally after whitespace) by ``>`` has its ``>``
    replaced by ``&gt;``, so the text can never contain the ``-->`` that
    would terminate an HTML comment.
    """
    comment = str(s)
    comment = _comment_quote_re.sub('-&gt;', comment)
    return comment


url_quote = quote
url_unquote = unquote

if __name__ == '__main__':
    import doctest
    doctest.testmod()