diff options
Diffstat (limited to 'django/utils/encoding.py')
-rw-r--r-- | django/utils/encoding.py | 45 |
1 files changed, 41 insertions, 4 deletions
diff --git a/django/utils/encoding.py b/django/utils/encoding.py index 63f4193701..63f915b139 100644 --- a/django/utils/encoding.py +++ b/django/utils/encoding.py @@ -2,7 +2,7 @@ import codecs import datetime import locale from decimal import Decimal -from urllib.parse import quote, unquote_to_bytes +from urllib.parse import quote from django.utils import six from django.utils.functional import Promise @@ -151,20 +151,57 @@ def iri_to_uri(iri): return quote(iri, safe="/#%[]=:;$&()+,!?*@'~") +# List of byte values that uri_to_iri() decodes from percent encoding. +# First, the unreserved characters from RFC 3986: +_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)] +_hextobyte = { + (fmt % char).encode(): bytes((char,)) + for ascii_range in _ascii_ranges + for char in ascii_range + for fmt in ['%02x', '%02X'] +} +# And then everything above 128, because bytes ≥ 128 are part of multibyte +# unicode characters. +_hexdig = '0123456789ABCDEFabcdef' +_hextobyte.update({ + (a + b).encode(): bytes.fromhex(a + b) + for a in _hexdig[8:] for b in _hexdig +}) + + def uri_to_iri(uri): """ Converts a Uniform Resource Identifier(URI) into an Internationalized Resource Identifier(IRI). - This is the algorithm from section 3.2 of RFC 3987. + This is the algorithm from section 3.2 of RFC 3987, excluding step 4. Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns - a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/'). + a string containing the encoded result (e.g. '/I%20♥%20Django/'). """ if uri is None: return uri uri = force_bytes(uri) - iri = unquote_to_bytes(uri) + # Fast selective unqote: First, split on '%' and then starting with the + # second block, decode the first 2 bytes if they represent a hex code to + # decode. The rest of the block is the part after '%AB', not containing + # any '%'. Add that to the output without further processing. + bits = uri.split(b'%') + if len(bits) == 1: + iri = uri + else: + parts = [bits[0]] + append = parts.append + hextobyte = _hextobyte + for item in bits[1:]: + hex = item[:2] + if hex in hextobyte: + append(hextobyte[item[:2]]) + append(item[2:]) + else: + append(b'%') + append(item) + iri = b''.join(parts) return repercent_broken_unicode(iri).decode() |