test_wsgirequest_charset: Use UTF-8 instead of iso-8859-1 (branch: test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1)

because it seems that the de facto standard for encoding URIs is to use UTF-8.

I've been reading about URL encoding and it seems like perhaps using an encoding other than UTF-8 is very non-standard and not well-supported (this test is trying to use `iso-8859-1`).

From http://en.wikipedia.org/wiki/Percent-encoding:

> For a non-ASCII character, it is typically converted to its byte sequence in
> UTF-8, and then each byte value is represented as above.

> The generic URI syntax mandates that new URI schemes that provide for the
> representation of character data in a URI must, in effect, represent
> characters from the unreserved set without translation, and should convert
> all other characters to bytes according to UTF-8, and then percent-encode
> those values. This requirement was introduced in January 2005 with the
> publication of RFC 3986.

From http://tools.ietf.org/html/rfc3986:

> Non-ASCII characters must first be encoded according to UTF-8 [STD63], and
> then each octet of the corresponding UTF-8 sequence must be percent-encoded
> to be represented as URI characters. URI producing applications must not use
> percent-encoding in host unless it is used to represent a UTF-8 character
> sequence.

From http://tools.ietf.org/html/rfc3987:

> Conversions from URIs to IRIs MUST NOT use any character encoding other than
> UTF-8 in steps 3 and 4, even if it might be possible to guess from the
> context that another character encoding than UTF-8 was used in the URI. For
> example, the URI "http://www.example.org/r%E9sum%E9.html" might with some
> guessing be interpreted to contain two e-acute characters encoded as
> iso-8859-1. It must not be converted to an IRI containing these e-acute
> characters. Otherwise, in the future the IRI will be mapped to
> "http://www.example.org/r%C3%A9sum%C3%A9.html", which is a different URI from
> "http://www.example.org/r%E9sum%E9.html".

See issue #7, which I think this at least partially fixes.
author: Marc Abramowitz <marc@marc-abramowitz.com> 2015-04-30 17:39:24 -0700
committer: Marc Abramowitz <marc@marc-abramowitz.com> 2015-04-30 17:39:24 -0700
commit: fa100c92c06d3a8a61a0dda1a2e06018437b09c6 (patch)
tree: a1cc50f93fbf257685c3849e03496c5e33949281 /paste/exceptions/serial_number_generator.py
download: paste-git-test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1.tar.gz
1 files changed, 129 insertions, 0 deletions
diff --git a/paste/exceptions/serial_number_generator.py b/paste/exceptions/serial_number_generator.py
new file mode 100644
index 0000000..3f80107
--- /dev/null
+++ b/paste/exceptions/serial_number_generator.py
@@ -0,0 +1,129 @@
+# (c) 2005 Ian Bicking and contributors; written for Paste (http://pythonpaste.org)
+# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
+
+"""
+Creates a human-readable identifier, using digits and letters,
+avoiding ambiguous numbers and letters.  hash_identifier can be used
+to create compact representations that are unique for a certain string
+(or concatenation of strings)
+"""
+
+try:
+    from hashlib import md5
+except ImportError:
+    from md5 import md5
+
+import six
+
+good_characters = "23456789abcdefghjkmnpqrtuvwxyz"  # digits 2-9 and lowercase letters; 0, 1, i, l, o and s are left out
+
+base = len(good_characters)  # radix used by make_identifier (30 symbols)
+
+def make_identifier(number):
+    """
+    Encodes a non-negative integer in ``good_characters``, lowest digit first.
+    """
+    if not isinstance(number, six.integer_types):
+        raise ValueError(
+            "You can only make identifiers out of integers (not %r)"
+            % number)
+    if number < 0:
+        raise ValueError(
+            "You cannot make identifiers out of negative numbers: %r"
+            % number)
+    result = []
+    while number:
+        next = number % base
+        result.append(good_characters[next])
+        # Floor division drops the digit that was just emitted:
+        number = number // base
+    return ''.join(result)  # '' when number is 0 (the loop never runs)
+
+def hash_identifier(s, length, pad=True, hasher=md5, prefix='',
+                    group=None, upper=False):
+    """
+    Hashes ``s`` with ``hasher`` (text is encoded as UTF-8; objects
+    that are neither text nor bytes go through ``str()`` first),
+    reduces the digest modulo ``base ** length`` and encodes it with
+    ``make_identifier``.  If ``pad`` is false the result may be shorter
+    than ``length``; otherwise it is left-padded with ``2``
+    (``good_characters[0]``), not with ``0``.
+
+    ``group`` splits the identifier into ``-``-separated groups of that
+    size, counted from the right (``group=4`` -> ``cy-2dr6-rg46``);
+    ``upper`` upper-cases it.  ``prefix`` is added last, unchanged.
+    Neither the prefix nor the separators count towards ``length``.
+    """
+    if not callable(hasher):
+        # Accept sha/md5 modules as well as callables
+        hasher = hasher.new
+    if length > 26 and hasher is md5:  # 30 ** 26 < 2 ** 128 < 30 ** 27, so a 128-bit md5 digest covers at most 26 characters
+        raise ValueError(
+            "md5 cannot create hashes longer than 26 characters in "
+            "length (you gave %s)" % length)
+    if isinstance(s, six.text_type):
+        s = s.encode('utf-8')
+    elif not isinstance(s, six.binary_type):
+        s = str(s)
+        if six.PY3:  # on Python 2 str() already returned a byte string
+            s = s.encode('utf-8')
+    h = hasher(s)
+    bin_hash = h.digest()
+    modulo = base ** length
+    number = 0
+    for c in list(bin_hash):
+        number = (number * 256 + six.byte2int([c])) % modulo  # read the digest as a big-endian integer, reduced mod base ** length at every step
+    ident = make_identifier(number)
+    if pad:
+        ident = good_characters[0]*(length-len(ident)) + ident  # NOTE(review): make_identifier puts the lowest digit first, so left padding can make two numbers collide (1 and 30 both give '23' for length=2) - confirm whether this matters
+    if group:
+        parts = []
+        while ident:
+            parts.insert(0, ident[-group:])
+            ident = ident[:-group]
+        ident = '-'.join(parts)
+    if upper:
+        ident = ident.upper()
+    return prefix + ident
+
+# doctest tests (doctest.testmod() picks up the module-level __test__ mapping):
+__test__ = {
+    'make_identifier': """
+    >>> make_identifier(0)
+    ''
+    >>> make_identifier(1000)
+    'c53'
+    >>> make_identifier(-100)
+    Traceback (most recent call last):
+        ...
+    ValueError: You cannot make identifiers out of negative numbers: -100
+    >>> make_identifier('test')
+    Traceback (most recent call last):
+        ...
+    ValueError: You can only make identifiers out of integers (not 'test')
+    >>> make_identifier(1000000000000)
+    'c53x9rqh3'
+    """,
+    'hash_identifier': """
+    >>> hash_identifier(0, 5)
+    'cy2dr'
+    >>> hash_identifier(0, 10)
+    'cy2dr6rg46'
+    >>> hash_identifier('this is a test of a long string', 5)
+    'awatu'
+    >>> hash_identifier(0, 26)
+    'cy2dr6rg46cx8t4w2f3nfexzk4'
+    >>> hash_identifier(0, 30)
+    Traceback (most recent call last):
+        ...
+    ValueError: md5 cannot create hashes longer than 26 characters in length (you gave 30)
+    >>> hash_identifier(0, 10, group=4)
+    'cy-2dr6-rg46'
+    >>> hash_identifier(0, 10, group=4, upper=True, prefix='M-')
+    'M-CY-2DR6-RG46'
+    """}
+
+if __name__ == '__main__':  # run the doctests above when executed as a script
+    import doctest
+    doctest.testmod()
+
author	Marc Abramowitz <marc@marc-abramowitz.com>	2015-04-30 17:39:24 -0700
committer	Marc Abramowitz <marc@marc-abramowitz.com>	2015-04-30 17:39:24 -0700
commit	fa100c92c06d3a8a61a0dda1a2e06018437b09c6 (patch)
tree	a1cc50f93fbf257685c3849e03496c5e33949281 /paste/exceptions/serial_number_generator.py
download	paste-git-test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1.tar.gz