diff options
author | Anubhav Joshi <anubhav9042@gmail.com> | 2014-07-22 17:55:22 +0530 |
---|---|---|
committer | Loic Bistuer <loic.bistuer@gmail.com> | 2014-10-16 02:31:17 +0700 |
commit | 10b17a22bec2eaf44c3315614aea87c127caee46 (patch) | |
tree | 39145c16ca06aa33050e1642076db4216d663a10 /django/utils/encoding.py | |
parent | 3af5af1a61d73c533aca4fb0ea1f53e4f6300b17 (diff) | |
download | django-10b17a22bec2eaf44c3315614aea87c127caee46.tar.gz |
Fixed #19508 -- Implemented uri_to_iri as per RFC.
Thanks Loic Bistuer for helping in shaping the patch and Claude Paroz
for the review.
Diffstat (limited to 'django/utils/encoding.py')
-rw-r--r-- | django/utils/encoding.py | 41 |
1 files changed, 39 insertions, 2 deletions
```diff
diff --git a/django/utils/encoding.py b/django/utils/encoding.py
index beb5e54ae8..3abee09c52 100644
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
 from __future__ import unicode_literals
 
 import codecs
@@ -7,7 +8,9 @@ import locale
 
 from django.utils.functional import Promise
 from django.utils import six
-from django.utils.six.moves.urllib.parse import quote
+from django.utils.six.moves.urllib.parse import quote, unquote
+if six.PY3:
+    from urllib.parse import unquote_to_bytes
 
 
 class DjangoUnicodeDecodeError(UnicodeDecodeError):
@@ -185,7 +188,9 @@ def iri_to_uri(iri):
     assuming input is either UTF-8 or unicode already, we can simplify things a
     little from the full method.
 
-    Returns an ASCII string containing the encoded result.
+    Takes an IRI in UTF-8 bytes (e.g. '/I \xe2\x99\xa5 Django/') or unicode
+    (e.g. '/I ♥ Django/') and returns ASCII bytes containing the encoded result
+    (e.g. '/I%20%E2%99%A5%20Django/').
     """
     # The list of safe characters here is constructed from the "reserved" and
     # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
@@ -204,6 +209,38 @@ def iri_to_uri(iri):
     return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
 
 
+def uri_to_iri(uri):
+    """
+    Converts a Uniform Resource Identifier(URI) into an Internationalized
+    Resource Identifier(IRI).
+
+    This is the algorithm from section 3.2 of RFC 3987.
+
+    Takes an URI in ASCII bytes (e.g. '/I%20%E2%99%A5%20Django/') and returns
+    unicode containing the encoded result (e.g. '/I \xe2\x99\xa5 Django/').
+    """
+    if uri is None:
+        return uri
+    uri = force_bytes(uri)
+    iri = unquote_to_bytes(uri) if six.PY3 else unquote(uri)
+    return repercent_broken_unicode(iri).decode('utf-8')
+
+
+def repercent_broken_unicode(path):
+    """
+    As per section 3.2 of RFC 3987, step three of converting a URI into an IRI,
+    we need to re-percent-encode any octet produced that is not part of a
+    strictly legal UTF-8 octet sequence.
+    """
+    try:
+        path.decode('utf-8')
+    except UnicodeDecodeError as e:
+        repercent = quote(path[e.start:e.end], safe=b"/#%[]=:;$&()+,!?*@'~")
+        path = repercent_broken_unicode(
+            path[:e.start] + force_bytes(repercent) + path[e.end:])
+    return path
+
+
 def filepath_to_uri(path):
     """Convert a file system path to a URI portion that is suitable for
     inclusion in a URL.
```