summaryrefslogtreecommitdiff
path: root/django/utils/encoding.py
diff options
context:
space:
mode:
Diffstat (limited to 'django/utils/encoding.py')
-rw-r--r--django/utils/encoding.py45
1 files changed, 41 insertions, 4 deletions
diff --git a/django/utils/encoding.py b/django/utils/encoding.py
index 63f4193701..63f915b139 100644
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -2,7 +2,7 @@ import codecs
import datetime
import locale
from decimal import Decimal
-from urllib.parse import quote, unquote_to_bytes
+from urllib.parse import quote
from django.utils import six
from django.utils.functional import Promise
@@ -151,20 +151,57 @@ def iri_to_uri(iri):
return quote(iri, safe="/#%[]=:;$&()+,!?*@'~")
+# List of byte values that uri_to_iri() decodes from percent encoding.
+# First, the unreserved characters from RFC 3986:
+_ascii_ranges = [[45, 46, 95, 126], range(65, 91), range(97, 123)]
+_hextobyte = {
+ (fmt % char).encode(): bytes((char,))
+ for ascii_range in _ascii_ranges
+ for char in ascii_range
+ for fmt in ['%02x', '%02X']
+}
+# And then everything above 128, because bytes ≥ 128 are part of multibyte
+# unicode characters.
+_hexdig = '0123456789ABCDEFabcdef'
+_hextobyte.update({
+ (a + b).encode(): bytes.fromhex(a + b)
+ for a in _hexdig[8:] for b in _hexdig
+})
+
+
def uri_to_iri(uri):
"""
Converts a Uniform Resource Identifier(URI) into an Internationalized
Resource Identifier(IRI).
- This is the algorithm from section 3.2 of RFC 3987.
+ This is the algorithm from section 3.2 of RFC 3987, excluding step 4.
Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
- a string containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
+ a string containing the encoded result (e.g. '/I%20♥%20Django/').
"""
if uri is None:
return uri
uri = force_bytes(uri)
- iri = unquote_to_bytes(uri)
+ # Fast selective unqote: First, split on '%' and then starting with the
+ # second block, decode the first 2 bytes if they represent a hex code to
+ # decode. The rest of the block is the part after '%AB', not containing
+ # any '%'. Add that to the output without further processing.
+ bits = uri.split(b'%')
+ if len(bits) == 1:
+ iri = uri
+ else:
+ parts = [bits[0]]
+ append = parts.append
+ hextobyte = _hextobyte
+ for item in bits[1:]:
+ hex = item[:2]
+ if hex in hextobyte:
+ append(hextobyte[item[:2]])
+ append(item[2:])
+ else:
+ append(b'%')
+ append(item)
+ iri = b''.join(parts)
return repercent_broken_unicode(iri).decode()