diff options
author | Marc Abramowitz <marc@marc-abramowitz.com> | 2015-04-30 17:39:24 -0700 |
---|---|---|
committer | Marc Abramowitz <marc@marc-abramowitz.com> | 2015-04-30 17:39:24 -0700 |
commit | fa100c92c06d3a8a61a0dda1a2e06018437b09c6 (patch) | |
tree | a1cc50f93fbf257685c3849e03496c5e33949281 /paste/exceptions/serial_number_generator.py | |
download | paste-git-test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1.tar.gz |
test_wsgirequest_charset: Use UTF-8 instead of iso-8859-1 [test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1]
because it seems that the de facto standard for encoding URIs is to use UTF-8.
I've been reading about url encoding and it seems like perhaps using an
encoding other than UTF-8 is very non-standard and not well-supported (this
test is trying to use `iso-8859-1`).
From http://en.wikipedia.org/wiki/Percent-encoding
> For a non-ASCII character, it is typically converted to its byte sequence in
> UTF-8, and then each byte value is represented as above.
> The generic URI syntax mandates that new URI schemes that provide for the
> representation of character data in a URI must, in effect, represent
> characters from the unreserved set without translation, and should convert
> all other characters to bytes according to UTF-8, and then percent-encode
> those values. This requirement was introduced in January 2005 with the
> publication of RFC 3986
From http://tools.ietf.org/html/rfc3986:
> Non-ASCII characters must first be encoded according to UTF-8 [STD63], and
> then each octet of the corresponding UTF-8 sequence must be percent-encoded
> to be represented as URI characters. URI producing applications must not use
> percent-encoding in host unless it is used to represent a UTF-8 character
> sequence.
From http://tools.ietf.org/html/rfc3987:
> Conversions from URIs to IRIs MUST NOT use any character encoding other than
> UTF-8 in steps 3 and 4, even if it might be possible to guess from the
> context that another character encoding than UTF-8 was used in the URI. For
> example, the URI "http://www.example.org/r%E9sum%E9.html" might with some
> guessing be interpreted to contain two e-acute characters encoded as
> iso-8859-1. It must not be converted to an IRI containing these e-acute
> characters. Otherwise, in the future the IRI will be mapped to
> "http://www.example.org/r%C3%A9sum%C3%A9.html", which is a different URI from
> "http://www.example.org/r%E9sum%E9.html".
See issue #7, which I think this at least partially fixes.
Diffstat (limited to 'paste/exceptions/serial_number_generator.py')
-rw-r--r-- | paste/exceptions/serial_number_generator.py | 129 |
1 files changed, 129 insertions, 0 deletions
# Recovered from the added lines of the git diff for
# paste/exceptions/serial_number_generator.py (new file mode 100644,
# index 0000000..3f80107).

# (c) 2005 Ian Bicking and contributors; written for Paste (http://pythonpaste.org)
# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php

"""
Creates a human-readable identifier, using letters and digits,
avoiding ambiguous numbers and letters.  hash_identifier can be used
to create compact representations that are unique for a certain string
(or concatenation of strings)
"""

try:
    from hashlib import md5
except ImportError:
    from md5 import md5

import six

# Alphabet used for identifiers; easily-confused characters are left out.
good_characters = "23456789abcdefghjkmnpqrtuvwxyz"

# Identifiers are numbers written in this base, one character per digit.
base = len(good_characters)


def make_identifier(number):
    """
    Encodes a non-negative integer as an identifier.

    The number is written in base ``len(good_characters)`` using
    ``good_characters`` as the digits, least-significant digit first.
    Zero encodes to the empty string.

    Raises ``ValueError`` if ``number`` is not an integer or is negative.
    """
    if not isinstance(number, six.integer_types):
        # Wrap the value in a 1-tuple so that a tuple argument is shown
        # in the message instead of being unpacked by the % operator.
        raise ValueError(
            "You can only make identifiers out of integers (not %r)"
            % (number,))
    if number < 0:
        raise ValueError(
            "You cannot make identifiers out of negative numbers: %r"
            % (number,))
    result = []
    while number:
        number, digit = divmod(number, base)
        result.append(good_characters[digit])
    return ''.join(result)


def hash_identifier(s, length, pad=True, hasher=md5, prefix='',
                    group=None, upper=False):
    """
    Hashes the string (with the given hashing module), then turns that
    hash into an identifier of the given length (using modulo to
    reduce the length of the identifier).  If ``pad`` is False, then
    the minimum-length identifier will be used; otherwise the
    identifier will be left-padded as necessary with the alphabet's
    zero digit (``good_characters[0]``).

    ``s`` may be text (hashed as UTF-8), bytes (hashed as-is) or any
    other object (hashed via ``str(s)``).  ``hasher`` may be a callable
    returning a hash object, or a module-like object with a ``new``
    attribute.

    ``prefix`` will be added last, and does not count towards the
    target length.  ``group`` will group the characters with ``-`` in
    the given lengths, and also does not count towards the target
    length.  E.g., ``group=4`` will cause an identifier like
    ``a5f3-hgk3-asdf``.  Grouping occurs before the prefix.  If
    ``upper`` is true the identifier (but not the prefix) is
    upper-cased.

    Raises ``ValueError`` if ``length`` is greater than 26 while using
    md5, or if ``group`` is negative.
    """
    if not callable(hasher):
        # Accept sha/md5 modules as well as callables
        hasher = hasher.new
    if length > 26 and hasher is md5:
        raise ValueError(
            "md5 cannot create hashes longer than 26 characters in "
            "length (you gave %s)" % length)
    if group is not None and group < 0:
        # A negative group size would make the grouping loop below spin
        # forever, because the slices would stop shrinking the identifier.
        raise ValueError(
            "group must not be negative (you gave %r)" % (group,))
    if isinstance(s, six.text_type):
        s = s.encode('utf-8')
    elif not isinstance(s, six.binary_type):
        s = str(s)
        if six.PY3:
            s = s.encode('utf-8')
    bin_hash = hasher(s).digest()
    modulo = base ** length
    number = 0
    # Iterating a bytearray yields integers on both Python 2 and 3.
    for byte in bytearray(bin_hash):
        number = (number * 256 + byte) % modulo
    ident = make_identifier(number)
    if pad:
        ident = good_characters[0] * (length - len(ident)) + ident
    if group:
        # Split from the right so any short group ends up at the front.
        parts = []
        while ident:
            parts.append(ident[-group:])
            ident = ident[:-group]
        ident = '-'.join(reversed(parts))
    if upper:
        ident = ident.upper()
    return prefix + ident


# doctest tests:
__test__ = {
    'make_identifier': """
    >>> make_identifier(0)
    ''
    >>> make_identifier(1000)
    'c53'
    >>> make_identifier(-100)
    Traceback (most recent call last):
        ...
    ValueError: You cannot make identifiers out of negative numbers: -100
    >>> make_identifier('test')
    Traceback (most recent call last):
        ...
    ValueError: You can only make identifiers out of integers (not 'test')
    >>> make_identifier(1000000000000)
    'c53x9rqh3'
    """,
    'hash_identifier': """
    >>> hash_identifier(0, 5)
    'cy2dr'
    >>> hash_identifier(0, 10)
    'cy2dr6rg46'
    >>> hash_identifier('this is a test of a long string', 5)
    'awatu'
    >>> hash_identifier(0, 26)
    'cy2dr6rg46cx8t4w2f3nfexzk4'
    >>> hash_identifier(0, 30)
    Traceback (most recent call last):
        ...
    ValueError: md5 cannot create hashes longer than 26 characters in length (you gave 30)
    >>> hash_identifier(0, 10, group=4)
    'cy-2dr6-rg46'
    >>> hash_identifier(0, 10, group=4, upper=True, prefix='M-')
    'M-CY-2DR6-RG46'
    """}

if __name__ == '__main__':
    import doctest
    doctest.testmod()