diff options
author | Vlastimil Zíma <vlastimil.zima@nic.cz> | 2018-03-09 13:37:22 +0100 |
---|---|---|
committer | Vlastimil Zíma <vlastimil.zima@nic.cz> | 2018-04-30 14:20:16 +0200 |
commit | a1f864ada10d00edc1e58e5ecb97fab6ed319a62 (patch) | |
tree | 71072773c20d15444bc63073863cfbcef28563f9 /openid | |
parent | f638838a14956ca608368d2efa50e73b282a8581 (diff) | |
download | openid-a1f864ada10d00edc1e58e5ecb97fab6ed319a62.tar.gz |
Refactor urinorm
Diffstat (limited to 'openid')
-rw-r--r-- | openid/test/data/trustroot.txt | 16 | ||||
-rw-r--r-- | openid/test/test_urinorm.py | 95 | ||||
-rw-r--r-- | openid/test/urinorm.txt | 87 | ||||
-rw-r--r-- | openid/urinorm.py | 220 |
4 files changed, 156 insertions, 262 deletions
diff --git a/openid/test/data/trustroot.txt b/openid/test/data/trustroot.txt index 3d948a4..f46ec08 100644 --- a/openid/test/data/trustroot.txt +++ b/openid/test/data/trustroot.txt @@ -3,32 +3,31 @@ Trust root parsing checking ======================================== ---------------------------------------- -21: Does not parse +20: Does not parse ---------------------------------------- baz.org *.foo.com http://*.schtuff.*/ ftp://foo.com ftp://*.foo.com -http://*.foo.com:80:90/ http:/// http:// foo.*.com http://foo.*.com http://www.* http://*foo.com/ +http://.it/ +http://..it/ http://foo.com\/ http://localhost:1900foo/ http://foo.com/invalid#fragment -http://π.pi.com/ -http://lambda.com/Λ 5 ---------------------------------------- -15: Insane +13: Insane ---------------------------------------- http://*/ https://*/ @@ -43,11 +42,9 @@ http://*.museum/ https://*.museum/ http://www.schtuffcom/ http://it/ -http://..it/ -http://.it/ ---------------------------------------- -18: Sane +21: Sane ---------------------------------------- http://*.schtuff.com./ http://*.schtuff.com/ @@ -67,6 +64,9 @@ https://foo.com/ http://kink.fm/should/be/sane http://beta.lingu.no/ http://goathack.livejournal.org:8020/openid/login.bml +http://*.example.com:80:90/ +http://π.pi.example.com/ +http://lambda.example.com/Λ ======================================== return_to matching diff --git a/openid/test/test_urinorm.py b/openid/test/test_urinorm.py index 0db74eb..50b5355 100644 --- a/openid/test/test_urinorm.py +++ b/openid/test/test_urinorm.py @@ -1,30 +1,77 @@ -import os -import unittest +# -*- coding: utf-8 -*- +"""Tests for `openid.urinorm` module.""" +from __future__ import unicode_literals -import openid.urinorm +import unittest -with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'urinorm.txt')) as test_data_file: - test_data = test_data_file.read() +from openid.urinorm import urinorm class UrinormTest(unittest.TestCase): + """Test `urinorm` function.""" + + def 
test_normalized(self): + self.assertEqual(urinorm('http://example.com/'), 'http://example.com/') + self.assertEqual(urinorm(b'http://example.com/'), 'http://example.com/') + + def test_lowercase_scheme(self): + self.assertEqual(urinorm('htTP://example.com/'), 'http://example.com/') + + def test_unsupported_scheme(self): + self.assertRaisesRegexp(ValueError, 'Not an absolute HTTP or HTTPS URI', urinorm, 'ftp://example.com/') + + def test_lowercase_hostname(self): + self.assertEqual(urinorm('http://exaMPLE.COm/'), 'http://example.com/') + + def test_idn_hostname(self): + self.assertEqual(urinorm('http://π.example.com/'), 'http://xn--1xa.example.com/') + + def test_empty_hostname(self): + self.assertEqual(urinorm('http://username@/'), 'http://username@/') + + def test_invalid_hostname(self): + self.assertRaisesRegexp(ValueError, 'Invalid hostname', urinorm, 'http://.it/') + self.assertRaisesRegexp(ValueError, 'Invalid hostname', urinorm, 'http://..it/') + self.assertRaisesRegexp(ValueError, 'Not an absolute URI', urinorm, 'http:///path/') + + def test_empty_port_section(self): + self.assertEqual(urinorm('http://example.com:/'), 'http://example.com/') + + def test_default_ports(self): + self.assertEqual(urinorm('http://example.com:80/'), 'http://example.com/') + self.assertEqual(urinorm('https://example.com:443/'), 'https://example.com/') + + def test_empty_path(self): + self.assertEqual(urinorm('http://example.com'), 'http://example.com/') + + def test_path_dots(self): + self.assertEqual(urinorm('http://example.com/./a'), 'http://example.com/a') + self.assertEqual(urinorm('http://example.com/../a'), 'http://example.com/a') + + self.assertEqual(urinorm('http://example.com/a/.'), 'http://example.com/a/') + self.assertEqual(urinorm('http://example.com/a/..'), 'http://example.com/') + self.assertEqual(urinorm('http://example.com/a/./'), 'http://example.com/a/') + self.assertEqual(urinorm('http://example.com/a/../'), 'http://example.com/') + + 
self.assertEqual(urinorm('http://example.com/a/./b'), 'http://example.com/a/b') + self.assertEqual(urinorm('http://example.com/a/../b'), 'http://example.com/b') + + self.assertEqual(urinorm('http://example.com/a/b/c/./../../g'), 'http://example.com/a/g') + self.assertEqual(urinorm('http://example.com/mid/content=5/../6'), 'http://example.com/mid/6') + + def test_path_percent_encoding(self): + self.assertEqual(urinorm('http://example.com/\x08'), 'http://example.com/%08') + self.assertEqual(urinorm('http://example.com/Λ'), 'http://example.com/%CE%9B') + + def test_path_capitalize_percent_encoding(self): + self.assertEqual(urinorm('http://example.com/foo%2cbar'), 'http://example.com/foo%2Cbar') + + def test_path_percent_decode_unreserved(self): + self.assertEqual(urinorm('http://example.com/foo%2Dbar%2dbaz'), 'http://example.com/foo-bar-baz') + + def test_illegal_characters(self): + self.assertRaisesRegexp(ValueError, 'Illegal characters in URI', urinorm, 'http://<illegal>.com/') - def runTest(self): - for case in test_data.split('\n\n'): - case = case.strip() - if not case: - continue - - desc, raw, expected = self.parse(case) - try: - actual = openid.urinorm.urinorm(raw) - except ValueError as why: - self.assertEqual(expected, 'fail', why) - else: - self.assertEqual(actual, expected, desc) - - def parse(self, full_case): - desc, case, expected = full_case.split('\n') - case = unicode(case, 'utf-8') - - return (desc, case, expected) + def test_realms(self): + # Urinorm supports OpenID realms with * in them + self.assertEqual(urinorm('http://*.example.com/'), 'http://*.example.com/') diff --git a/openid/test/urinorm.txt b/openid/test/urinorm.txt deleted file mode 100644 index a5db39e..0000000 --- a/openid/test/urinorm.txt +++ /dev/null @@ -1,87 +0,0 @@ -Already normal form -http://example.com/ -http://example.com/ - -Add a trailing slash -http://example.com -http://example.com/ - -Remove an empty port segment -http://example.com:/ -http://example.com/ - -Remove a default 
port segment -http://example.com:80/ -http://example.com/ - -Capitalization in host names -http://wWw.exaMPLE.COm/ -http://www.example.com/ - -Capitalization in scheme names -htTP://example.com/ -http://example.com/ - -Capitalization in percent-escaped reserved characters -http://example.com/foo%2cbar -http://example.com/foo%2Cbar - -Unescape percent-encoded unreserved characters -http://example.com/foo%2Dbar%2dbaz -http://example.com/foo-bar-baz - -remove_dot_segments example 1 -http://example.com/a/b/c/./../../g -http://example.com/a/g - -remove_dot_segments example 2 -http://example.com/mid/content=5/../6 -http://example.com/mid/6 - -remove_dot_segments: single-dot -http://example.com/a/./b -http://example.com/a/b - -remove_dot_segments: double-dot -http://example.com/a/../b -http://example.com/b - -remove_dot_segments: leading double-dot -http://example.com/../b -http://example.com/b - -remove_dot_segments: trailing single-dot -http://example.com/a/. -http://example.com/a/ - -remove_dot_segments: trailing double-dot -http://example.com/a/.. -http://example.com/ - -remove_dot_segments: trailing single-dot-slash -http://example.com/a/./ -http://example.com/a/ - -remove_dot_segments: trailing double-dot-slash -http://example.com/a/../ -http://example.com/ - -Test of all kinds of syntax-based normalization -hTTPS://a/./b/../b/%63/%7bfoo%7d -https://a/b/c/%7Bfoo%7D - -Unsupported scheme -ftp://example.com/ -fail - -Non-absolute URI -http:/foo -fail - -Illegal character in URI -http://<illegal>.com/ -fail - -Non-ascii character in URI -http://foo.com/ -fail diff --git a/openid/urinorm.py b/openid/urinorm.py index e7127d3..0da86ee 100644 --- a/openid/urinorm.py +++ b/openid/urinorm.py @@ -1,105 +1,12 @@ -import re - -# from appendix B of rfc 3986 (http://www.ietf.org/rfc/rfc3986.txt) -uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?' -uri_re = re.compile(uri_pattern) - -# gen-delims = ":" / "/" / "?" 
/ "#" / "[" / "]" / "@" -# -# sub-delims = "!" / "$" / "&" / "'" / "(" / ")" -# / "*" / "+" / "," / ";" / "=" -# -# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" - -uri_illegal_char_re = re.compile( - "[^-A-Za-z0-9:/?#[\]@!$&'()*+,;=._~%]", re.UNICODE) - -authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?' -authority_re = re.compile(authority_pattern) - - -pct_encoded_pattern = r'%([0-9A-Fa-f]{2})' -pct_encoded_re = re.compile(pct_encoded_pattern) - -try: - unichr(0x10000) -except ValueError: - # narrow python build - UCSCHAR = [ - (0xA0, 0xD7FF), - (0xF900, 0xFDCF), - (0xFDF0, 0xFFEF), - ] - - IPRIVATE = [ - (0xE000, 0xF8FF), - ] -else: - UCSCHAR = [ - (0xA0, 0xD7FF), - (0xF900, 0xFDCF), - (0xFDF0, 0xFFEF), - (0x10000, 0x1FFFD), - (0x20000, 0x2FFFD), - (0x30000, 0x3FFFD), - (0x40000, 0x4FFFD), - (0x50000, 0x5FFFD), - (0x60000, 0x6FFFD), - (0x70000, 0x7FFFD), - (0x80000, 0x8FFFD), - (0x90000, 0x9FFFD), - (0xA0000, 0xAFFFD), - (0xB0000, 0xBFFFD), - (0xC0000, 0xCFFFD), - (0xD0000, 0xDFFFD), - (0xE1000, 0xEFFFD), - ] - - IPRIVATE = [ - (0xE000, 0xF8FF), - (0xF0000, 0xFFFFD), - (0x100000, 0x10FFFD), - ] - - -_unreserved = [False] * 256 -for _ in range(ord('A'), ord('Z') + 1): - _unreserved[_] = True -for _ in range(ord('0'), ord('9') + 1): - _unreserved[_] = True -for _ in range(ord('a'), ord('z') + 1): - _unreserved[_] = True -_unreserved[ord('-')] = True -_unreserved[ord('.')] = True -_unreserved[ord('_')] = True -_unreserved[ord('~')] = True - - -_escapeme_re = re.compile('[%s]' % ''.join(u'%s-%s' % (unichr(m_n[0]), unichr(m_n[1])) for m_n in UCSCHAR + IPRIVATE)) - - -def _pct_escape_unicode(char_match): - c = char_match.group() - return ''.join(['%%%X' % (ord(octet),) for octet in c.encode('utf-8')]) - - -def _pct_encoded_replace_unreserved(mo): - try: - i = int(mo.group(1), 16) - if _unreserved[i]: - return chr(i) - else: - return mo.group().upper() +"""URI normalization utilities.""" +from __future__ import unicode_literals - except ValueError: - return 
mo.group() +import string +import warnings +from urllib import quote, unquote, urlencode +from urlparse import parse_qsl, urlsplit, urlunsplit - -def _pct_encoded_replace(mo): - try: - return chr(int(mo.group(1), 16)) - except ValueError: - return mo.group() +import six def remove_dot_segments(path): @@ -137,65 +44,92 @@ def remove_dot_segments(path): return ''.join(result_segments) -def urinorm(uri): - if isinstance(uri, unicode): - uri = _escapeme_re.sub(_pct_escape_unicode, uri).encode('ascii') +GEN_DELIMS = ":" + "/" + "?" + "#" + "[" + "]" + "@" +SUB_DELIMS = "!" + "$" + "&" + "'" + "(" + ")" + "*" + "+" + "," + ";" + "=" +RESERVED = GEN_DELIMS + SUB_DELIMS +UNRESERVED = string.ascii_letters + string.digits + "-" + "." + "_" + "~" +# Allow "%" as percent encoding character +PERCENT_ENCODING_CHARACTER = "%" - illegal_mo = uri_illegal_char_re.search(uri) - if illegal_mo: - raise ValueError('Illegal characters in URI: %r at position %s' % - (illegal_mo.group(), illegal_mo.start())) - uri_mo = uri_re.match(uri) +def _check_disallowed_characters(uri_part, part_name): + # Roughly check the allowed characters. The check is not strict according to URI ABNF, but good enough. + # Also allow "%" for percent encoding. + if set(uri_part).difference(set(UNRESERVED + RESERVED + PERCENT_ENCODING_CHARACTER)): + raise ValueError('Illegal characters in URI {}: {}'.format(part_name, uri_part)) - scheme = uri_mo.group(2) - if scheme is None: - raise ValueError('No scheme specified') - scheme = scheme.lower() - if scheme not in ('http', 'https'): - raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri,)) +def urinorm(uri): + """Return normalized URI. - authority = uri_mo.group(4) - if authority is None: - raise ValueError('Not an absolute URI: %r' % (uri,)) + Normalization is performed according to RFC 3986, section 6 https://tools.ietf.org/html/rfc3986#section-6. + Supported URIs are URLs and OpenID realm URIs. 
- authority_mo = authority_re.match(authority) - if authority_mo is None: - raise ValueError('URI does not have a valid authority: %r' % (uri,)) + @type uri: six.text_type, six.binary_type deprecated + @rtype: six.text_type + @raise ValueError: If URI is invalid. + """ + # Transform the input to the unicode string + if isinstance(uri, six.binary_type): + warnings.warn("Binary input for urinorm is deprecated. Use text input instead.", DeprecationWarning) + uri = uri.decode('utf-8') - userinfo, host, port = authority_mo.groups() + split_uri = urlsplit(uri) - if userinfo is None: - userinfo = '' + # Normalize scheme + scheme = split_uri.scheme.lower() + if scheme not in ('http', 'https'): + raise ValueError('Not an absolute HTTP or HTTPS URI: {!r}'.format(uri)) - if '%' in host: - host = host.lower() - host = pct_encoded_re.sub(_pct_encoded_replace, host) - host = unicode(host, 'utf-8').encode('idna') - else: - host = host.lower() + # Normalize netloc + if not split_uri.netloc: + raise ValueError('Not an absolute URI: {!r}'.format(uri)) - if port: - if port == ':' or (scheme == 'http' and port == ':80') or (scheme == 'https' and port == ':443'): - port = '' + hostname = split_uri.hostname + if hostname is None: + hostname = '' else: - port = '' + hostname = hostname.lower() + # Unquote percent encoded characters + hostname = unquote(hostname) + # Quote IDN domain names + try: + hostname = hostname.encode('idna') + except ValueError as error: + raise ValueError('Invalid hostname {!r}: {}'.format(hostname, error)) + _check_disallowed_characters(hostname, 'hostname') - authority = userinfo + host + port + port = split_uri.port + if port is None: + port = '' + elif (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443): + port = '' - path = uri_mo.group(5) - path = pct_encoded_re.sub(_pct_encoded_replace_unreserved, path) + netloc = hostname + if port: + netloc = netloc + ':' + str(port) + userinfo_chunks = [i for i in (split_uri.username, 
split_uri.password) if i is not None] + if userinfo_chunks: + userinfo = ':'.join(userinfo_chunks) + _check_disallowed_characters(userinfo, 'userinfo') + netloc = userinfo + '@' + netloc + + # Normalize path + path = split_uri.path + # Unquote and quote - this normalizes the percent encoding + path = quote(unquote(path.encode('utf-8'))).decode('utf-8') path = remove_dot_segments(path) if not path: path = '/' + _check_disallowed_characters(path, 'path') - query = uri_mo.group(6) - if query is None: - query = '' + # Normalize query + data = parse_qsl(split_uri.query) + query = urlencode(data) + _check_disallowed_characters(query, 'query') - fragment = uri_mo.group(8) - if fragment is None: - fragment = '' + # Normalize fragment + fragment = unquote(split_uri.fragment) + _check_disallowed_characters(fragment, 'fragment') - return scheme + '://' + authority + path + query + fragment + return urlunsplit((scheme, netloc, path, query, fragment)) |