summaryrefslogtreecommitdiff
path: root/paste/exceptions/serial_number_generator.py
diff options
context:
space:
mode:
authorMarc Abramowitz <marc@marc-abramowitz.com>2015-04-30 17:39:24 -0700
committerMarc Abramowitz <marc@marc-abramowitz.com>2015-04-30 17:39:24 -0700
commitfa100c92c06d3a8a61a0dda1a2e06018437b09c6 (patch)
treea1cc50f93fbf257685c3849e03496c5e33949281 /paste/exceptions/serial_number_generator.py
downloadpaste-git-test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1.tar.gz
test_wsgirequest_charset: Use UTF-8 instead of iso-8859-1test_wsgirequest_charset_use_UTF-8_instead_of_iso-8859-1
because it seems that the defacto standard for encoding URIs is to use UTF-8. I've been reading about url encoding and it seems like perhaps using an encoding other than UTF-8 is very non-standard and not well-supported (this test is trying to use `iso-8859-1`). From http://en.wikipedia.org/wiki/Percent-encoding > For a non-ASCII character, it is typically converted to its byte sequence in > UTF-8, and then each byte value is represented as above. > The generic URI syntax mandates that new URI schemes that provide for the > representation of character data in a URI must, in effect, represent > characters from the unreserved set without translation, and should convert > all other characters to bytes according to UTF-8, and then percent-encode > those values. This requirement was introduced in January 2005 with the > publication of RFC 3986 From http://tools.ietf.org/html/rfc3986: > Non-ASCII characters must first be encoded according to UTF-8 [STD63], and > then each octet of the corresponding UTF-8 sequence must be percent-encoded > to be represented as URI characters. URI producing applications must not use > percent-encoding in host unless it is used to represent a UTF-8 character > sequence. From http://tools.ietf.org/html/rfc3987: > Conversions from URIs to IRIs MUST NOT use any character encoding other than > UTF-8 in steps 3 and 4, even if it might be possible to guess from the > context that another character encoding than UTF-8 was used in the URI. For > example, the URI "http://www.example.org/r%E9sum%E9.html" might with some > guessing be interpreted to contain two e-acute characters encoded as > iso-8859-1. It must not be converted to an IRI containing these e-acute > characters. Otherwise, in the future the IRI will be mapped to > "http://www.example.org/r%C3%A9sum%C3%A9.html", which is a different URI from > "http://www.example.org/r%E9sum%E9.html". See issue #7, which I think this at least partially fixes.
Diffstat (limited to 'paste/exceptions/serial_number_generator.py')
-rw-r--r--paste/exceptions/serial_number_generator.py129
1 files changed, 129 insertions, 0 deletions
diff --git a/paste/exceptions/serial_number_generator.py b/paste/exceptions/serial_number_generator.py
new file mode 100644
index 0000000..3f80107
--- /dev/null
+++ b/paste/exceptions/serial_number_generator.py
@@ -0,0 +1,129 @@
+# (c) 2005 Ian Bicking and contributors; written for Paste (http://pythonpaste.org)
+# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
+
+"""
+Creates a human-readable identifier, using numbers and digits,
+avoiding ambiguous numbers and letters. hash_identifier can be used
+to create compact representations that are unique for a certain string
+(or concatenation of strings)
+"""
+
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import md5
+
+import six
+
+good_characters = "23456789abcdefghjkmnpqrtuvwxyz"
+
+base = len(good_characters)
+
+def make_identifier(number):
+ """
+ Encodes a number as an identifier.
+ """
+ if not isinstance(number, six.integer_types):
+ raise ValueError(
+ "You can only make identifiers out of integers (not %r)"
+ % number)
+ if number < 0:
+ raise ValueError(
+ "You cannot make identifiers out of negative numbers: %r"
+ % number)
+ result = []
+ while number:
+ next = number % base
+ result.append(good_characters[next])
+ # Note, this depends on integer rounding of results:
+ number = number // base
+ return ''.join(result)
+
+def hash_identifier(s, length, pad=True, hasher=md5, prefix='',
+ group=None, upper=False):
+ """
+ Hashes the string (with the given hashing module), then turns that
+ hash into an identifier of the given length (using modulo to
+ reduce the length of the identifier). If ``pad`` is False, then
+ the minimum-length identifier will be used; otherwise the
+ identifier will be padded with 0's as necessary.
+
+ ``prefix`` will be added last, and does not count towards the
+ target length. ``group`` will group the characters with ``-`` in
+ the given lengths, and also does not count towards the target
+ length. E.g., ``group=4`` will cause a identifier like
+ ``a5f3-hgk3-asdf``. Grouping occurs before the prefix.
+ """
+ if not callable(hasher):
+ # Accept sha/md5 modules as well as callables
+ hasher = hasher.new
+ if length > 26 and hasher is md5:
+ raise ValueError(
+ "md5 cannot create hashes longer than 26 characters in "
+ "length (you gave %s)" % length)
+ if isinstance(s, six.text_type):
+ s = s.encode('utf-8')
+ elif not isinstance(s, six.binary_type):
+ s = str(s)
+ if six.PY3:
+ s = s.encode('utf-8')
+ h = hasher(s)
+ bin_hash = h.digest()
+ modulo = base ** length
+ number = 0
+ for c in list(bin_hash):
+ number = (number * 256 + six.byte2int([c])) % modulo
+ ident = make_identifier(number)
+ if pad:
+ ident = good_characters[0]*(length-len(ident)) + ident
+ if group:
+ parts = []
+ while ident:
+ parts.insert(0, ident[-group:])
+ ident = ident[:-group]
+ ident = '-'.join(parts)
+ if upper:
+ ident = ident.upper()
+ return prefix + ident
+
+# doctest tests:
+__test__ = {
+ 'make_identifier': """
+ >>> make_identifier(0)
+ ''
+ >>> make_identifier(1000)
+ 'c53'
+ >>> make_identifier(-100)
+ Traceback (most recent call last):
+ ...
+ ValueError: You cannot make identifiers out of negative numbers: -100
+ >>> make_identifier('test')
+ Traceback (most recent call last):
+ ...
+ ValueError: You can only make identifiers out of integers (not 'test')
+ >>> make_identifier(1000000000000)
+ 'c53x9rqh3'
+ """,
+ 'hash_identifier': """
+ >>> hash_identifier(0, 5)
+ 'cy2dr'
+ >>> hash_identifier(0, 10)
+ 'cy2dr6rg46'
+ >>> hash_identifier('this is a test of a long string', 5)
+ 'awatu'
+ >>> hash_identifier(0, 26)
+ 'cy2dr6rg46cx8t4w2f3nfexzk4'
+ >>> hash_identifier(0, 30)
+ Traceback (most recent call last):
+ ...
+ ValueError: md5 cannot create hashes longer than 26 characters in length (you gave 30)
+ >>> hash_identifier(0, 10, group=4)
+ 'cy-2dr6-rg46'
+ >>> hash_identifier(0, 10, group=4, upper=True, prefix='M-')
+ 'M-CY-2DR6-RG46'
+ """}
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
+