summaryrefslogtreecommitdiff
path: root/django/utils/encoding.py
diff options
context:
space:
mode:
authorAnubhav Joshi <anubhav9042@gmail.com>2014-07-22 17:55:22 +0530
committerLoic Bistuer <loic.bistuer@gmail.com>2014-10-16 02:31:17 +0700
commit10b17a22bec2eaf44c3315614aea87c127caee46 (patch)
tree39145c16ca06aa33050e1642076db4216d663a10 /django/utils/encoding.py
parent3af5af1a61d73c533aca4fb0ea1f53e4f6300b17 (diff)
downloaddjango-10b17a22bec2eaf44c3315614aea87c127caee46.tar.gz
Fixed #19508 -- Implemented uri_to_iri as per RFC.
Thanks Loic Bistuer for helping in shaping the patch and Claude Paroz for the review.
Diffstat (limited to 'django/utils/encoding.py')
-rw-r--r--django/utils/encoding.py41
1 files changed, 39 insertions, 2 deletions
diff --git a/django/utils/encoding.py b/django/utils/encoding.py
index beb5e54ae8..3abee09c52 100644
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
from __future__ import unicode_literals
import codecs
@@ -7,7 +8,9 @@ import locale
from django.utils.functional import Promise
from django.utils import six
-from django.utils.six.moves.urllib.parse import quote
+from django.utils.six.moves.urllib.parse import quote, unquote
+if six.PY3:
+ from urllib.parse import unquote_to_bytes
class DjangoUnicodeDecodeError(UnicodeDecodeError):
@@ -185,7 +188,9 @@ def iri_to_uri(iri):
assuming input is either UTF-8 or unicode already, we can simplify things a
little from the full method.
- Returns an ASCII string containing the encoded result.
+ Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
+ (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
+ (e.g. '/I%20%E2%99%A5%20Django/').
"""
# The list of safe characters here is constructed from the "reserved" and
# "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
@@ -204,6 +209,38 @@ def iri_to_uri(iri):
return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
+def uri_to_iri(uri):
+ """
+ Converts a Uniform Resource Identifier (URI) into an Internationalized
+ Resource Identifier (IRI).
+
+ This is the algorithm from section 3.2 of RFC 3987.
+
+ Takes a URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
+ unicode containing the decoded result (e.g. '/I ♥ Django/').
+ """
+ if uri is None:
+ return uri
+ uri = force_bytes(uri)
+ iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri)
+ return repercent_broken_unicode(iri).decode('utf-8')
+
+
+def repercent_broken_unicode(path):
+ """
+ As per section 3.2 of RFC 3987, step three of converting a URI into an IRI,
+ we need to re-percent-encode any octet produced that is not part of a
+ strictly legal UTF-8 octet sequence.
+ """
+ try:
+ path.decode('utf-8')
+ except UnicodeDecodeError as e:
+ repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
+ path = repercent_broken_unicode(
+ path[:e.start] + force_bytes(repercent) + path[e.end:])
+ return path
+
+
def filepath_to_uri(path):
"""Convert a file system path to a URI portion that is suitable for
inclusion in a URL.