Refactor urinorm

author: Vlastimil Zíma <vlastimil.zima@nic.cz> 2018-03-09 13:37:22 +0100
committer: Vlastimil Zíma <vlastimil.zima@nic.cz> 2018-04-30 14:20:16 +0200
commit: a1f864ada10d00edc1e58e5ecb97fab6ed319a62 (patch)
tree: 71072773c20d15444bc63073863cfbcef28563f9 /openid
parent: f638838a14956ca608368d2efa50e73b282a8581 (diff)
download: openid-a1f864ada10d00edc1e58e5ecb97fab6ed319a62.tar.gz
4 files changed, 156 insertions, 262 deletions
diff --git a/openid/test/data/trustroot.txt b/openid/test/data/trustroot.txt
index 3d948a4..f46ec08 100644
--- a/openid/test/data/trustroot.txt
+++ b/openid/test/data/trustroot.txt
@@ -3,32 +3,31 @@ Trust root parsing checking
 ========================================
 
 ----------------------------------------
-21: Does not parse
+20: Does not parse
 ----------------------------------------
 baz.org
 *.foo.com
 http://*.schtuff.*/
 ftp://foo.com
 ftp://*.foo.com
-http://*.foo.com:80:90/
 http:///
 http://
 foo.*.com
 http://foo.*.com
 http://www.*
 http://*foo.com/
+http://.it/
+http://..it/
 http://foo.com\/
 http://localhost:1900foo/
 http://foo.com/invalid#fragment
-http://π.pi.com/
-http://lambda.com/Λ
 
  
  	
 5
 
 ----------------------------------------
-15: Insane
+13: Insane
 ----------------------------------------
 http://*/
 https://*/
@@ -43,11 +42,9 @@ http://*.museum/
 https://*.museum/
 http://www.schtuffcom/
 http://it/
-http://..it/
-http://.it/
 
 ----------------------------------------
-18: Sane
+21: Sane
 ----------------------------------------
 http://*.schtuff.com./
 http://*.schtuff.com/
@@ -67,6 +64,9 @@ https://foo.com/
 http://kink.fm/should/be/sane
 http://beta.lingu.no/
 http://goathack.livejournal.org:8020/openid/login.bml
+http://*.example.com:80:90/
+http://π.pi.example.com/
+http://lambda.example.com/Λ
 
 ========================================
 return_to matching
diff --git a/openid/test/test_urinorm.py b/openid/test/test_urinorm.py
index 0db74eb..50b5355 100644
--- a/openid/test/test_urinorm.py
+++ b/openid/test/test_urinorm.py
@@ -1,30 +1,77 @@
-import os
-import unittest
+# -*- coding: utf-8 -*-
+"""Tests for `openid.urinorm` module."""
+from __future__ import unicode_literals
 
-import openid.urinorm
+import unittest
 
-with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'urinorm.txt')) as test_data_file:
-    test_data = test_data_file.read()
+from openid.urinorm import urinorm
 
 
 class UrinormTest(unittest.TestCase):
+    """Test `urinorm` function."""
+
+    def test_normalized(self):
+        self.assertEqual(urinorm('http://example.com/'), 'http://example.com/')
+        self.assertEqual(urinorm(b'http://example.com/'), 'http://example.com/')
+
+    def test_lowercase_scheme(self):
+        self.assertEqual(urinorm('htTP://example.com/'), 'http://example.com/')
+
+    def test_unsupported_scheme(self):
+        self.assertRaisesRegexp(ValueError, 'Not an absolute HTTP or HTTPS URI', urinorm, 'ftp://example.com/')
+
+    def test_lowercase_hostname(self):
+        self.assertEqual(urinorm('http://exaMPLE.COm/'), 'http://example.com/')
+
+    def test_idn_hostname(self):
+        self.assertEqual(urinorm('http://π.example.com/'), 'http://xn--1xa.example.com/')
+
+    def test_empty_hostname(self):
+        self.assertEqual(urinorm('http://username@/'), 'http://username@/')
+
+    def test_invalid_hostname(self):
+        self.assertRaisesRegexp(ValueError, 'Invalid hostname', urinorm, 'http://.it/')
+        self.assertRaisesRegexp(ValueError, 'Invalid hostname', urinorm, 'http://..it/')
+        self.assertRaisesRegexp(ValueError, 'Not an absolute URI', urinorm, 'http:///path/')
+
+    def test_empty_port_section(self):
+        self.assertEqual(urinorm('http://example.com:/'), 'http://example.com/')
+
+    def test_default_ports(self):
+        self.assertEqual(urinorm('http://example.com:80/'), 'http://example.com/')
+        self.assertEqual(urinorm('https://example.com:443/'), 'https://example.com/')
+
+    def test_empty_path(self):
+        self.assertEqual(urinorm('http://example.com'), 'http://example.com/')
+
+    def test_path_dots(self):
+        self.assertEqual(urinorm('http://example.com/./a'), 'http://example.com/a')
+        self.assertEqual(urinorm('http://example.com/../a'), 'http://example.com/a')
+
+        self.assertEqual(urinorm('http://example.com/a/.'), 'http://example.com/a/')
+        self.assertEqual(urinorm('http://example.com/a/..'), 'http://example.com/')
+        self.assertEqual(urinorm('http://example.com/a/./'), 'http://example.com/a/')
+        self.assertEqual(urinorm('http://example.com/a/../'), 'http://example.com/')
+
+        self.assertEqual(urinorm('http://example.com/a/./b'), 'http://example.com/a/b')
+        self.assertEqual(urinorm('http://example.com/a/../b'), 'http://example.com/b')
+
+        self.assertEqual(urinorm('http://example.com/a/b/c/./../../g'), 'http://example.com/a/g')
+        self.assertEqual(urinorm('http://example.com/mid/content=5/../6'), 'http://example.com/mid/6')
+
+    def test_path_percent_encoding(self):
+        self.assertEqual(urinorm('http://example.com/\x08'), 'http://example.com/%08')
+        self.assertEqual(urinorm('http://example.com/Λ'), 'http://example.com/%CE%9B')
+
+    def test_path_capitalize_percent_encoding(self):
+        self.assertEqual(urinorm('http://example.com/foo%2cbar'), 'http://example.com/foo%2Cbar')
+
+    def test_path_percent_decode_unreserved(self):
+        self.assertEqual(urinorm('http://example.com/foo%2Dbar%2dbaz'), 'http://example.com/foo-bar-baz')
+
+    def test_illegal_characters(self):
+        self.assertRaisesRegexp(ValueError, 'Illegal characters in URI', urinorm, 'http://<illegal>.com/')
 
-    def runTest(self):
-        for case in test_data.split('\n\n'):
-            case = case.strip()
-            if not case:
-                continue
-
-            desc, raw, expected = self.parse(case)
-            try:
-                actual = openid.urinorm.urinorm(raw)
-            except ValueError as why:
-                self.assertEqual(expected, 'fail', why)
-            else:
-                self.assertEqual(actual, expected, desc)
-
-    def parse(self, full_case):
-        desc, case, expected = full_case.split('\n')
-        case = unicode(case, 'utf-8')
-
-        return (desc, case, expected)
+    def test_realms(self):
+        # Urinorm supports OpenID realms with * in them
+        self.assertEqual(urinorm('http://*.example.com/'), 'http://*.example.com/')
diff --git a/openid/test/urinorm.txt b/openid/test/urinorm.txt
deleted file mode 100644
index a5db39e..0000000
--- a/openid/test/urinorm.txt
+++ /dev/null
@@ -1,87 +0,0 @@
-Already normal form
-http://example.com/
-http://example.com/
-
-Add a trailing slash
-http://example.com
-http://example.com/
-
-Remove an empty port segment
-http://example.com:/
-http://example.com/
-
-Remove a default port segment
-http://example.com:80/
-http://example.com/
-
-Capitalization in host names
-http://wWw.exaMPLE.COm/
-http://www.example.com/
-
-Capitalization in scheme names
-htTP://example.com/
-http://example.com/
-
-Capitalization in percent-escaped reserved characters
-http://example.com/foo%2cbar
-http://example.com/foo%2Cbar
-
-Unescape percent-encoded unreserved characters
-http://example.com/foo%2Dbar%2dbaz
-http://example.com/foo-bar-baz
-
-remove_dot_segments example 1
-http://example.com/a/b/c/./../../g
-http://example.com/a/g
-
-remove_dot_segments example 2
-http://example.com/mid/content=5/../6
-http://example.com/mid/6
-
-remove_dot_segments: single-dot
-http://example.com/a/./b
-http://example.com/a/b
-
-remove_dot_segments: double-dot
-http://example.com/a/../b
-http://example.com/b
-
-remove_dot_segments: leading double-dot
-http://example.com/../b
-http://example.com/b
-
-remove_dot_segments: trailing single-dot
-http://example.com/a/.
-http://example.com/a/
-
-remove_dot_segments: trailing double-dot
-http://example.com/a/..
-http://example.com/
-
-remove_dot_segments: trailing single-dot-slash
-http://example.com/a/./
-http://example.com/a/
-
-remove_dot_segments: trailing double-dot-slash
-http://example.com/a/../
-http://example.com/
-
-Test of all kinds of syntax-based normalization
-hTTPS://a/./b/../b/%63/%7bfoo%7d
-https://a/b/c/%7Bfoo%7D
-
-Unsupported scheme
-ftp://example.com/
-fail
-
-Non-absolute URI
-http:/foo
-fail
-
-Illegal character in URI
-http://<illegal>.com/
-fail
-
-Non-ascii character in URI
-http://foo.com/
-fail
diff --git a/openid/urinorm.py b/openid/urinorm.py
index e7127d3..0da86ee 100644
--- a/openid/urinorm.py
+++ b/openid/urinorm.py
@@ -1,105 +1,12 @@
-import re
-
-# from appendix B of rfc 3986 (http://www.ietf.org/rfc/rfc3986.txt)
-uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
-uri_re = re.compile(uri_pattern)
-
-# gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
-#
-# sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
-#                  / "*" / "+" / "," / ";" / "="
-#
-# unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
-
-uri_illegal_char_re = re.compile(
-    "[^-A-Za-z0-9:/?#[\]@!$&'()*+,;=._~%]", re.UNICODE)
-
-authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?'
-authority_re = re.compile(authority_pattern)
-
-
-pct_encoded_pattern = r'%([0-9A-Fa-f]{2})'
-pct_encoded_re = re.compile(pct_encoded_pattern)
-
-try:
-    unichr(0x10000)
-except ValueError:
-    # narrow python build
-    UCSCHAR = [
-        (0xA0, 0xD7FF),
-        (0xF900, 0xFDCF),
-        (0xFDF0, 0xFFEF),
-    ]
-
-    IPRIVATE = [
-        (0xE000, 0xF8FF),
-    ]
-else:
-    UCSCHAR = [
-        (0xA0, 0xD7FF),
-        (0xF900, 0xFDCF),
-        (0xFDF0, 0xFFEF),
-        (0x10000, 0x1FFFD),
-        (0x20000, 0x2FFFD),
-        (0x30000, 0x3FFFD),
-        (0x40000, 0x4FFFD),
-        (0x50000, 0x5FFFD),
-        (0x60000, 0x6FFFD),
-        (0x70000, 0x7FFFD),
-        (0x80000, 0x8FFFD),
-        (0x90000, 0x9FFFD),
-        (0xA0000, 0xAFFFD),
-        (0xB0000, 0xBFFFD),
-        (0xC0000, 0xCFFFD),
-        (0xD0000, 0xDFFFD),
-        (0xE1000, 0xEFFFD),
-    ]
-
-    IPRIVATE = [
-        (0xE000, 0xF8FF),
-        (0xF0000, 0xFFFFD),
-        (0x100000, 0x10FFFD),
-    ]
-
-
-_unreserved = [False] * 256
-for _ in range(ord('A'), ord('Z') + 1):
-    _unreserved[_] = True
-for _ in range(ord('0'), ord('9') + 1):
-    _unreserved[_] = True
-for _ in range(ord('a'), ord('z') + 1):
-    _unreserved[_] = True
-_unreserved[ord('-')] = True
-_unreserved[ord('.')] = True
-_unreserved[ord('_')] = True
-_unreserved[ord('~')] = True
-
-
-_escapeme_re = re.compile('[%s]' % ''.join(u'%s-%s' % (unichr(m_n[0]), unichr(m_n[1])) for m_n in UCSCHAR + IPRIVATE))
-
-
-def _pct_escape_unicode(char_match):
-    c = char_match.group()
-    return ''.join(['%%%X' % (ord(octet),) for octet in c.encode('utf-8')])
-
-
-def _pct_encoded_replace_unreserved(mo):
-    try:
-        i = int(mo.group(1), 16)
-        if _unreserved[i]:
-            return chr(i)
-        else:
-            return mo.group().upper()
+"""URI normalization utilities."""
+from __future__ import unicode_literals
 
-    except ValueError:
-        return mo.group()
+import string
+import warnings
+from urllib import quote, unquote, urlencode
+from urlparse import parse_qsl, urlsplit, urlunsplit
 
-
-def _pct_encoded_replace(mo):
-    try:
-        return chr(int(mo.group(1), 16))
-    except ValueError:
-        return mo.group()
+import six
 
 
 def remove_dot_segments(path):
@@ -137,65 +44,92 @@ def remove_dot_segments(path):
     return ''.join(result_segments)
 
 
-def urinorm(uri):
-    if isinstance(uri, unicode):
-        uri = _escapeme_re.sub(_pct_escape_unicode, uri).encode('ascii')
+GEN_DELIMS = ":" + "/" + "?" + "#" + "[" + "]" + "@"
+SUB_DELIMS = "!" + "$" + "&" + "'" + "(" + ")" + "*" + "+" + "," + ";" + "="
+RESERVED = GEN_DELIMS + SUB_DELIMS
+UNRESERVED = string.ascii_letters + string.digits + "-" + "." + "_" + "~"
+# Allow "%" as percent encoding character
+PERCENT_ENCODING_CHARACTER = "%"
 
-    illegal_mo = uri_illegal_char_re.search(uri)
-    if illegal_mo:
-        raise ValueError('Illegal characters in URI: %r at position %s' %
-                         (illegal_mo.group(), illegal_mo.start()))
 
-    uri_mo = uri_re.match(uri)
+def _check_disallowed_characters(uri_part, part_name):
+    # Roughly check the allowed characters. The check is not strict according to URI ABNF, but good enough.
+    # Also allow "%" for percent encoding.
+    if set(uri_part).difference(set(UNRESERVED + RESERVED + PERCENT_ENCODING_CHARACTER)):
+        raise ValueError('Illegal characters in URI {}: {}'.format(part_name, uri_part))
 
-    scheme = uri_mo.group(2)
-    if scheme is None:
-        raise ValueError('No scheme specified')
 
-    scheme = scheme.lower()
-    if scheme not in ('http', 'https'):
-        raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri,))
+def urinorm(uri):
+    """Return normalized URI.
 
-    authority = uri_mo.group(4)
-    if authority is None:
-        raise ValueError('Not an absolute URI: %r' % (uri,))
+    Normalization is performed according to RFC 3986, section 6 https://tools.ietf.org/html/rfc3986#section-6.
+    Supported URIs are URLs and OpenID realm URIs.
 
-    authority_mo = authority_re.match(authority)
-    if authority_mo is None:
-        raise ValueError('URI does not have a valid authority: %r' % (uri,))
+    @type uri: six.text_type, six.binary_type deprecated
+    @rtype: six.text_type
+    @raise ValueError: If URI is invalid.
+    """
+    # Transform the input to the unicode string
+    if isinstance(uri, six.binary_type):
+        warnings.warn("Binary input for urinorm is deprecated. Use text input instead.", DeprecationWarning)
+        uri = uri.decode('utf-8')
 
-    userinfo, host, port = authority_mo.groups()
+    split_uri = urlsplit(uri)
 
-    if userinfo is None:
-        userinfo = ''
+    # Normalize scheme
+    scheme = split_uri.scheme.lower()
+    if scheme not in ('http', 'https'):
+        raise ValueError('Not an absolute HTTP or HTTPS URI: {!r}'.format(uri))
 
-    if '%' in host:
-        host = host.lower()
-        host = pct_encoded_re.sub(_pct_encoded_replace, host)
-        host = unicode(host, 'utf-8').encode('idna')
-    else:
-        host = host.lower()
+    # Normalize netloc
+    if not split_uri.netloc:
+        raise ValueError('Not an absolute URI: {!r}'.format(uri))
 
-    if port:
-        if port == ':' or (scheme == 'http' and port == ':80') or (scheme == 'https' and port == ':443'):
-            port = ''
+    hostname = split_uri.hostname
+    if hostname is None:
+        hostname = ''
     else:
-        port = ''
+        hostname = hostname.lower()
+    # Unquote percent encoded characters
+    hostname = unquote(hostname)
+    # Quote IDN domain names
+    try:
+        hostname = hostname.encode('idna')
+    except ValueError as error:
+        raise ValueError('Invalid hostname {!r}: {}'.format(hostname, error))
+    _check_disallowed_characters(hostname, 'hostname')
 
-    authority = userinfo + host + port
+    port = split_uri.port
+    if port is None:
+        port = ''
+    elif (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
+        port = ''
 
-    path = uri_mo.group(5)
-    path = pct_encoded_re.sub(_pct_encoded_replace_unreserved, path)
+    netloc = hostname
+    if port:
+        netloc = netloc + ':' + str(port)
+    userinfo_chunks = [i for i in (split_uri.username, split_uri.password) if i is not None]
+    if userinfo_chunks:
+        userinfo = ':'.join(userinfo_chunks)
+        _check_disallowed_characters(userinfo, 'userinfo')
+        netloc = userinfo + '@' + netloc
+
+    # Normalize path
+    path = split_uri.path
+    # Unquote and quote - this normalizes the percent encoding
+    path = quote(unquote(path.encode('utf-8'))).decode('utf-8')
     path = remove_dot_segments(path)
     if not path:
         path = '/'
+    _check_disallowed_characters(path, 'path')
 
-    query = uri_mo.group(6)
-    if query is None:
-        query = ''
+    # Normalize query
+    data = parse_qsl(split_uri.query)
+    query = urlencode(data)
+    _check_disallowed_characters(query, 'query')
 
-    fragment = uri_mo.group(8)
-    if fragment is None:
-        fragment = ''
+    # Normalize fragment
+    fragment = unquote(split_uri.fragment)
+    _check_disallowed_characters(fragment, 'fragment')
 
-    return scheme + '://' + authority + path + query + fragment
+    return urlunsplit((scheme, netloc, path, query, fragment))
author	Vlastimil Zíma <vlastimil.zima@nic.cz>	2018-03-09 13:37:22 +0100
committer	Vlastimil Zíma <vlastimil.zima@nic.cz>	2018-04-30 14:20:16 +0200
commit	a1f864ada10d00edc1e58e5ecb97fab6ed319a62 (patch)
tree	71072773c20d15444bc63073863cfbcef28563f9 /openid
parent	f638838a14956ca608368d2efa50e73b282a8581 (diff)
download	openid-a1f864ada10d00edc1e58e5ecb97fab6ed319a62.tar.gz