summaryrefslogtreecommitdiff
path: root/openid
diff options
context:
space:
mode:
authorVlastimil Zíma <vlastimil.zima@nic.cz>2018-03-09 13:37:22 +0100
committerVlastimil Zíma <vlastimil.zima@nic.cz>2018-04-30 14:20:16 +0200
commita1f864ada10d00edc1e58e5ecb97fab6ed319a62 (patch)
tree71072773c20d15444bc63073863cfbcef28563f9 /openid
parentf638838a14956ca608368d2efa50e73b282a8581 (diff)
downloadopenid-a1f864ada10d00edc1e58e5ecb97fab6ed319a62.tar.gz
Refactor urinorm
Diffstat (limited to 'openid')
-rw-r--r--openid/test/data/trustroot.txt16
-rw-r--r--openid/test/test_urinorm.py95
-rw-r--r--openid/test/urinorm.txt87
-rw-r--r--openid/urinorm.py220
4 files changed, 156 insertions, 262 deletions
diff --git a/openid/test/data/trustroot.txt b/openid/test/data/trustroot.txt
index 3d948a4..f46ec08 100644
--- a/openid/test/data/trustroot.txt
+++ b/openid/test/data/trustroot.txt
@@ -3,32 +3,31 @@ Trust root parsing checking
========================================
----------------------------------------
-21: Does not parse
+20: Does not parse
----------------------------------------
baz.org
*.foo.com
http://*.schtuff.*/
ftp://foo.com
ftp://*.foo.com
-http://*.foo.com:80:90/
http:///
http://
foo.*.com
http://foo.*.com
http://www.*
http://*foo.com/
+http://.it/
+http://..it/
http://foo.com\/
http://localhost:1900foo/
http://foo.com/invalid#fragment
-http://π.pi.com/
-http://lambda.com/Λ
5
----------------------------------------
-15: Insane
+13: Insane
----------------------------------------
http://*/
https://*/
@@ -43,11 +42,9 @@ http://*.museum/
https://*.museum/
http://www.schtuffcom/
http://it/
-http://..it/
-http://.it/
----------------------------------------
-18: Sane
+21: Sane
----------------------------------------
http://*.schtuff.com./
http://*.schtuff.com/
@@ -67,6 +64,9 @@ https://foo.com/
http://kink.fm/should/be/sane
http://beta.lingu.no/
http://goathack.livejournal.org:8020/openid/login.bml
+http://*.example.com:80:90/
+http://π.pi.example.com/
+http://lambda.example.com/Λ
========================================
return_to matching
diff --git a/openid/test/test_urinorm.py b/openid/test/test_urinorm.py
index 0db74eb..50b5355 100644
--- a/openid/test/test_urinorm.py
+++ b/openid/test/test_urinorm.py
@@ -1,30 +1,77 @@
-import os
-import unittest
+# -*- coding: utf-8 -*-
+"""Tests for `openid.urinorm` module."""
+from __future__ import unicode_literals
-import openid.urinorm
+import unittest
-with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'urinorm.txt')) as test_data_file:
- test_data = test_data_file.read()
+from openid.urinorm import urinorm
class UrinormTest(unittest.TestCase):
+ """Test `urinorm` function."""
+
+ def test_normalized(self):
+ self.assertEqual(urinorm('http://example.com/'), 'http://example.com/')
+ self.assertEqual(urinorm(b'http://example.com/'), 'http://example.com/')
+
+ def test_lowercase_scheme(self):
+ self.assertEqual(urinorm('htTP://example.com/'), 'http://example.com/')
+
+ def test_unsupported_scheme(self):
+ self.assertRaisesRegexp(ValueError, 'Not an absolute HTTP or HTTPS URI', urinorm, 'ftp://example.com/')
+
+ def test_lowercase_hostname(self):
+ self.assertEqual(urinorm('http://exaMPLE.COm/'), 'http://example.com/')
+
+ def test_idn_hostname(self):
+ self.assertEqual(urinorm('http://π.example.com/'), 'http://xn--1xa.example.com/')
+
+ def test_empty_hostname(self):
+ self.assertEqual(urinorm('http://username@/'), 'http://username@/')
+
+ def test_invalid_hostname(self):
+ self.assertRaisesRegexp(ValueError, 'Invalid hostname', urinorm, 'http://.it/')
+ self.assertRaisesRegexp(ValueError, 'Invalid hostname', urinorm, 'http://..it/')
+ self.assertRaisesRegexp(ValueError, 'Not an absolute URI', urinorm, 'http:///path/')
+
+ def test_empty_port_section(self):
+ self.assertEqual(urinorm('http://example.com:/'), 'http://example.com/')
+
+ def test_default_ports(self):
+ self.assertEqual(urinorm('http://example.com:80/'), 'http://example.com/')
+ self.assertEqual(urinorm('https://example.com:443/'), 'https://example.com/')
+
+ def test_empty_path(self):
+ self.assertEqual(urinorm('http://example.com'), 'http://example.com/')
+
+ def test_path_dots(self):
+ self.assertEqual(urinorm('http://example.com/./a'), 'http://example.com/a')
+ self.assertEqual(urinorm('http://example.com/../a'), 'http://example.com/a')
+
+ self.assertEqual(urinorm('http://example.com/a/.'), 'http://example.com/a/')
+ self.assertEqual(urinorm('http://example.com/a/..'), 'http://example.com/')
+ self.assertEqual(urinorm('http://example.com/a/./'), 'http://example.com/a/')
+ self.assertEqual(urinorm('http://example.com/a/../'), 'http://example.com/')
+
+ self.assertEqual(urinorm('http://example.com/a/./b'), 'http://example.com/a/b')
+ self.assertEqual(urinorm('http://example.com/a/../b'), 'http://example.com/b')
+
+ self.assertEqual(urinorm('http://example.com/a/b/c/./../../g'), 'http://example.com/a/g')
+ self.assertEqual(urinorm('http://example.com/mid/content=5/../6'), 'http://example.com/mid/6')
+
+ def test_path_percent_encoding(self):
+ self.assertEqual(urinorm('http://example.com/\x08'), 'http://example.com/%08')
+ self.assertEqual(urinorm('http://example.com/Λ'), 'http://example.com/%CE%9B')
+
+ def test_path_capitalize_percent_encoding(self):
+ self.assertEqual(urinorm('http://example.com/foo%2cbar'), 'http://example.com/foo%2Cbar')
+
+ def test_path_percent_decode_unreserved(self):
+ self.assertEqual(urinorm('http://example.com/foo%2Dbar%2dbaz'), 'http://example.com/foo-bar-baz')
+
+ def test_illegal_characters(self):
+ self.assertRaisesRegexp(ValueError, 'Illegal characters in URI', urinorm, 'http://<illegal>.com/')
- def runTest(self):
- for case in test_data.split('\n\n'):
- case = case.strip()
- if not case:
- continue
-
- desc, raw, expected = self.parse(case)
- try:
- actual = openid.urinorm.urinorm(raw)
- except ValueError as why:
- self.assertEqual(expected, 'fail', why)
- else:
- self.assertEqual(actual, expected, desc)
-
- def parse(self, full_case):
- desc, case, expected = full_case.split('\n')
- case = unicode(case, 'utf-8')
-
- return (desc, case, expected)
+ def test_realms(self):
+ # Urinorm supports OpenID realms with * in them
+ self.assertEqual(urinorm('http://*.example.com/'), 'http://*.example.com/')
diff --git a/openid/test/urinorm.txt b/openid/test/urinorm.txt
deleted file mode 100644
index a5db39e..0000000
--- a/openid/test/urinorm.txt
+++ /dev/null
@@ -1,87 +0,0 @@
-Already normal form
-http://example.com/
-http://example.com/
-
-Add a trailing slash
-http://example.com
-http://example.com/
-
-Remove an empty port segment
-http://example.com:/
-http://example.com/
-
-Remove a default port segment
-http://example.com:80/
-http://example.com/
-
-Capitalization in host names
-http://wWw.exaMPLE.COm/
-http://www.example.com/
-
-Capitalization in scheme names
-htTP://example.com/
-http://example.com/
-
-Capitalization in percent-escaped reserved characters
-http://example.com/foo%2cbar
-http://example.com/foo%2Cbar
-
-Unescape percent-encoded unreserved characters
-http://example.com/foo%2Dbar%2dbaz
-http://example.com/foo-bar-baz
-
-remove_dot_segments example 1
-http://example.com/a/b/c/./../../g
-http://example.com/a/g
-
-remove_dot_segments example 2
-http://example.com/mid/content=5/../6
-http://example.com/mid/6
-
-remove_dot_segments: single-dot
-http://example.com/a/./b
-http://example.com/a/b
-
-remove_dot_segments: double-dot
-http://example.com/a/../b
-http://example.com/b
-
-remove_dot_segments: leading double-dot
-http://example.com/../b
-http://example.com/b
-
-remove_dot_segments: trailing single-dot
-http://example.com/a/.
-http://example.com/a/
-
-remove_dot_segments: trailing double-dot
-http://example.com/a/..
-http://example.com/
-
-remove_dot_segments: trailing single-dot-slash
-http://example.com/a/./
-http://example.com/a/
-
-remove_dot_segments: trailing double-dot-slash
-http://example.com/a/../
-http://example.com/
-
-Test of all kinds of syntax-based normalization
-hTTPS://a/./b/../b/%63/%7bfoo%7d
-https://a/b/c/%7Bfoo%7D
-
-Unsupported scheme
-ftp://example.com/
-fail
-
-Non-absolute URI
-http:/foo
-fail
-
-Illegal character in URI
-http://<illegal>.com/
-fail
-
-Non-ascii character in URI
-http://foo.com/
-fail
diff --git a/openid/urinorm.py b/openid/urinorm.py
index e7127d3..0da86ee 100644
--- a/openid/urinorm.py
+++ b/openid/urinorm.py
@@ -1,105 +1,12 @@
-import re
-
-# from appendix B of rfc 3986 (http://www.ietf.org/rfc/rfc3986.txt)
-uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
-uri_re = re.compile(uri_pattern)
-
-# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
-#
-# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
-# / "*" / "+" / "," / ";" / "="
-#
-# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
-
-uri_illegal_char_re = re.compile(
- "[^-A-Za-z0-9:/?#[\]@!$&'()*+,;=._~%]", re.UNICODE)
-
-authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?'
-authority_re = re.compile(authority_pattern)
-
-
-pct_encoded_pattern = r'%([0-9A-Fa-f]{2})'
-pct_encoded_re = re.compile(pct_encoded_pattern)
-
-try:
- unichr(0x10000)
-except ValueError:
- # narrow python build
- UCSCHAR = [
- (0xA0, 0xD7FF),
- (0xF900, 0xFDCF),
- (0xFDF0, 0xFFEF),
- ]
-
- IPRIVATE = [
- (0xE000, 0xF8FF),
- ]
-else:
- UCSCHAR = [
- (0xA0, 0xD7FF),
- (0xF900, 0xFDCF),
- (0xFDF0, 0xFFEF),
- (0x10000, 0x1FFFD),
- (0x20000, 0x2FFFD),
- (0x30000, 0x3FFFD),
- (0x40000, 0x4FFFD),
- (0x50000, 0x5FFFD),
- (0x60000, 0x6FFFD),
- (0x70000, 0x7FFFD),
- (0x80000, 0x8FFFD),
- (0x90000, 0x9FFFD),
- (0xA0000, 0xAFFFD),
- (0xB0000, 0xBFFFD),
- (0xC0000, 0xCFFFD),
- (0xD0000, 0xDFFFD),
- (0xE1000, 0xEFFFD),
- ]
-
- IPRIVATE = [
- (0xE000, 0xF8FF),
- (0xF0000, 0xFFFFD),
- (0x100000, 0x10FFFD),
- ]
-
-
-_unreserved = [False] * 256
-for _ in range(ord('A'), ord('Z') + 1):
- _unreserved[_] = True
-for _ in range(ord('0'), ord('9') + 1):
- _unreserved[_] = True
-for _ in range(ord('a'), ord('z') + 1):
- _unreserved[_] = True
-_unreserved[ord('-')] = True
-_unreserved[ord('.')] = True
-_unreserved[ord('_')] = True
-_unreserved[ord('~')] = True
-
-
-_escapeme_re = re.compile('[%s]' % ''.join(u'%s-%s' % (unichr(m_n[0]), unichr(m_n[1])) for m_n in UCSCHAR + IPRIVATE))
-
-
-def _pct_escape_unicode(char_match):
- c = char_match.group()
- return ''.join(['%%%X' % (ord(octet),) for octet in c.encode('utf-8')])
-
-
-def _pct_encoded_replace_unreserved(mo):
- try:
- i = int(mo.group(1), 16)
- if _unreserved[i]:
- return chr(i)
- else:
- return mo.group().upper()
+"""URI normalization utilities."""
+from __future__ import unicode_literals
- except ValueError:
- return mo.group()
+import string
+import warnings
+from urllib import quote, unquote, urlencode
+from urlparse import parse_qsl, urlsplit, urlunsplit
-
-def _pct_encoded_replace(mo):
- try:
- return chr(int(mo.group(1), 16))
- except ValueError:
- return mo.group()
+import six
def remove_dot_segments(path):
@@ -137,65 +44,92 @@ def remove_dot_segments(path):
return ''.join(result_segments)
-def urinorm(uri):
- if isinstance(uri, unicode):
- uri = _escapeme_re.sub(_pct_escape_unicode, uri).encode('ascii')
+GEN_DELIMS = ":" + "/" + "?" + "#" + "[" + "]" + "@"
+SUB_DELIMS = "!" + "$" + "&" + "'" + "(" + ")" + "*" + "+" + "," + ";" + "="
+RESERVED = GEN_DELIMS + SUB_DELIMS
+UNRESERVED = string.ascii_letters + string.digits + "-" + "." + "_" + "~"
+# Allow "%" as percent encoding character
+PERCENT_ENCODING_CHARACTER = "%"
- illegal_mo = uri_illegal_char_re.search(uri)
- if illegal_mo:
- raise ValueError('Illegal characters in URI: %r at position %s' %
- (illegal_mo.group(), illegal_mo.start()))
- uri_mo = uri_re.match(uri)
+def _check_disallowed_characters(uri_part, part_name):
+ # Roughly check the allowed characters. The check is not strict according to URI ABNF, but good enough.
+ # Also allow "%" for percent encoding.
+ if set(uri_part).difference(set(UNRESERVED + RESERVED + PERCENT_ENCODING_CHARACTER)):
+ raise ValueError('Illegal characters in URI {}: {}'.format(part_name, uri_part))
- scheme = uri_mo.group(2)
- if scheme is None:
- raise ValueError('No scheme specified')
- scheme = scheme.lower()
- if scheme not in ('http', 'https'):
- raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri,))
+def urinorm(uri):
+ """Return normalized URI.
- authority = uri_mo.group(4)
- if authority is None:
- raise ValueError('Not an absolute URI: %r' % (uri,))
+ Normalization is performed according to RFC 3986, section 6 https://tools.ietf.org/html/rfc3986#section-6.
+ Supported URIs are URLs and OpenID realm URIs.
- authority_mo = authority_re.match(authority)
- if authority_mo is None:
- raise ValueError('URI does not have a valid authority: %r' % (uri,))
+ @type uri: six.text_type, six.binary_type deprecated
+ @rtype: six.text_type
+ @raise ValueError: If URI is invalid.
+ """
+ # Transform the input to the unicode string
+ if isinstance(uri, six.binary_type):
+ warnings.warn("Binary input for urinorm is deprecated. Use text input instead.", DeprecationWarning)
+ uri = uri.decode('utf-8')
- userinfo, host, port = authority_mo.groups()
+ split_uri = urlsplit(uri)
- if userinfo is None:
- userinfo = ''
+ # Normalize scheme
+ scheme = split_uri.scheme.lower()
+ if scheme not in ('http', 'https'):
+ raise ValueError('Not an absolute HTTP or HTTPS URI: {!r}'.format(uri))
- if '%' in host:
- host = host.lower()
- host = pct_encoded_re.sub(_pct_encoded_replace, host)
- host = unicode(host, 'utf-8').encode('idna')
- else:
- host = host.lower()
+ # Normalize netloc
+ if not split_uri.netloc:
+ raise ValueError('Not an absolute URI: {!r}'.format(uri))
- if port:
- if port == ':' or (scheme == 'http' and port == ':80') or (scheme == 'https' and port == ':443'):
- port = ''
+ hostname = split_uri.hostname
+ if hostname is None:
+ hostname = ''
else:
- port = ''
+ hostname = hostname.lower()
+ # Unquote percent encoded characters
+ hostname = unquote(hostname)
+ # Quote IDN domain names
+ try:
+ hostname = hostname.encode('idna')
+ except ValueError as error:
+ raise ValueError('Invalid hostname {!r}: {}'.format(hostname, error))
+ _check_disallowed_characters(hostname, 'hostname')
- authority = userinfo + host + port
+ port = split_uri.port
+ if port is None:
+ port = ''
+ elif (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
+ port = ''
- path = uri_mo.group(5)
- path = pct_encoded_re.sub(_pct_encoded_replace_unreserved, path)
+ netloc = hostname
+ if port:
+ netloc = netloc + ':' + str(port)
+ userinfo_chunks = [i for i in (split_uri.username, split_uri.password) if i is not None]
+ if userinfo_chunks:
+ userinfo = ':'.join(userinfo_chunks)
+ _check_disallowed_characters(userinfo, 'userinfo')
+ netloc = userinfo + '@' + netloc
+
+ # Normalize path
+ path = split_uri.path
+ # Unquote and quote - this normalizes the percent encoding
+ path = quote(unquote(path.encode('utf-8'))).decode('utf-8')
path = remove_dot_segments(path)
if not path:
path = '/'
+ _check_disallowed_characters(path, 'path')
- query = uri_mo.group(6)
- if query is None:
- query = ''
+ # Normalize query
+ data = parse_qsl(split_uri.query)
+ query = urlencode(data)
+ _check_disallowed_characters(query, 'query')
- fragment = uri_mo.group(8)
- if fragment is None:
- fragment = ''
+ # Normalize fragment
+ fragment = unquote(split_uri.fragment)
+ _check_disallowed_characters(fragment, 'fragment')
- return scheme + '://' + authority + path + query + fragment
+ return urlunsplit((scheme, netloc, path, query, fragment))