openid/urinorm.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150

"""URI normalization utilities."""
from __future__ import unicode_literals

import string

import six
from six.moves.urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsplit

from .oidutil import string_to_text


def remove_dot_segments(path):
    """Return `path` with all "." and ".." segments resolved.

    The path is consumed from the left. Ordinary segments are collected into an output buffer,
    "." segments are skipped, and every ".." segment discards the most recently collected segment (if any).

    @param path: URI path, possibly containing dot segments.
    @return: Path without dot segments.
    """
    output = []
    remaining = path

    while remaining:
        # Leading relative markers carry no information, just drop them.
        if remaining.startswith('../'):
            remaining = remaining[3:]
            continue
        if remaining.startswith('./'):
            remaining = remaining[2:]
            continue

        # "/./" and a trailing "/." both collapse to a single "/".
        if remaining.startswith('/./'):
            remaining = remaining[2:]
            continue
        if remaining == '/.':
            remaining = '/'
            continue

        # "/../" and a trailing "/.." collapse to "/" and step one collected segment back.
        if remaining.startswith('/../') or remaining == '/..':
            remaining = remaining[3:] or '/'
            if output:
                del output[-1]
            continue

        # Nothing but a dot segment is left.
        if remaining in ('..', '.'):
            break

        # Move the first segment (including its leading slash, if present) to the output.
        # Searching from index 1 skips the leading slash and finds the start of the next segment.
        cut = remaining.find('/', 1)
        if cut == -1:
            cut = len(remaining)
        output.append(remaining[:cut])
        remaining = remaining[cut:]

    return ''.join(output)


# Character classes used by the URI syntax (named after the RFC 3986 grammar rules).
GEN_DELIMS = ":/?#[]@"
SUB_DELIMS = "!$&'()*+,;="
RESERVED = GEN_DELIMS + SUB_DELIMS
UNRESERVED = string.ascii_letters + string.digits + "-._~"
# "%" introduces a percent encoded octet, so it has to be permitted as well.
PERCENT_ENCODING_CHARACTER = "%"


def _check_disallowed_characters(uri_part, part_name):
    """Raise ValueError if `uri_part` contains a character which is not allowed in a URI.

    The check is only rough: it is not strict according to the URI ABNF, but it is good enough.
    Every unreserved and reserved character is accepted, and so is "%" used for percent encoding.

    @param uri_part: The URI component to check.
    @param part_name: Name of the component, used in the error message.
    @raise ValueError: If a disallowed character is found.
    """
    allowed_characters = set(UNRESERVED + RESERVED + PERCENT_ENCODING_CHARACTER)
    if not allowed_characters.issuperset(uri_part):
        raise ValueError('Illegal characters in URI {}: {}'.format(part_name, uri_part))


def urinorm(uri):
    """Return normalized URI.

    Normalization is performed according to RFC 3986, section 6 https://tools.ietf.org/html/rfc3986#section-6.
    Supported URIs are URLs and OpenID realm URIs.

    The scheme and the host are lowercased, percent encoded characters in the host are decoded,
    internationalized host names are IDNA encoded, default ports are dropped, percent encoding of the path
    is normalized, dot segments are removed from the path, the query is re-encoded and the fragment is unquoted.
    IP literals (e.g. IPv6 addresses) keep their square brackets in the result.

    @type uri: six.text_type, six.binary_type deprecated
    @rtype: six.text_type
    @raise ValueError: If URI is invalid.
    """
    uri = string_to_text(uri, "Binary input for urinorm is deprecated. Use text input instead.")

    split_uri = urlsplit(uri)

    # Normalize scheme
    scheme = split_uri.scheme.lower()
    if scheme not in ('http', 'https'):
        raise ValueError('Not an absolute HTTP or HTTPS URI: {!r}'.format(uri))

    # Normalize netloc
    if not split_uri.netloc:
        raise ValueError('Not an absolute URI: {!r}'.format(uri))

    hostname = split_uri.hostname
    if hostname is None:
        hostname = ''
    # `hostname` strips the square brackets of an IP literal, so remember whether they have to be restored.
    is_ip_literal = split_uri.netloc.rpartition('@')[2].startswith('[')
    # Unquote percent encoded characters first, so the decoded letters are lowercased as well.
    hostname = unquote(hostname).lower()
    # Quote IDN domain names
    try:
        # hostname: str --[idna]--> bytes --[utf-8]--> str
        hostname = hostname.encode('idna').decode('utf-8')
    except ValueError as error:
        raise ValueError('Invalid hostname {!r}: {}'.format(hostname, error))
    _check_disallowed_characters(hostname, 'hostname')
    if is_ip_literal and hostname:
        # Without the brackets the colons of the address can't be told apart from the port separator.
        hostname = '[' + hostname + ']'

    try:
        port = split_uri.port
    except ValueError as error:
        raise ValueError('Invalid port in {!r}: {}'.format(split_uri.netloc, error))
    if port is None:
        port = ''
    elif (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
        # Default port of the scheme is omitted.
        port = ''

    netloc = hostname
    if port:
        netloc = netloc + ':' + six.text_type(port)
    userinfo_chunks = [i for i in (split_uri.username, split_uri.password) if i is not None]
    if userinfo_chunks:
        userinfo = ':'.join(userinfo_chunks)
        _check_disallowed_characters(userinfo, 'userinfo')
        netloc = userinfo + '@' + netloc

    # Normalize path
    path = split_uri.path
    # Unquote and quote - this normalizes the percent encoding

    # This is hackish. `unquote` and `quote` requires `str` in both py27 and py3+.
    if isinstance(path, str):
        # Python 3 branch
        path = quote(unquote(path), safe='/' + SUB_DELIMS)
    else:
        # Python 2 branch
        path = quote(unquote(path.encode('utf-8')), safe=('/' + SUB_DELIMS).encode('utf-8')).decode('utf-8')

    path = remove_dot_segments(path)
    if not path:
        # An empty path is equivalent to the root path.
        path = '/'
    _check_disallowed_characters(path, 'path')

    # Normalize query.  On Python 2, `urlencode` without `doseq=True`
    # requires values to be convertible to native strings using `str()`.
    if isinstance(split_uri.query, str):
        # Python 3 branch
        data = parse_qsl(split_uri.query)
    else:
        # Python 2 branch
        data = parse_qsl(split_uri.query.encode('utf-8'))
    query = urlencode(data)
    _check_disallowed_characters(query, 'query')

    # Normalize fragment
    fragment = unquote(split_uri.fragment)
    _check_disallowed_characters(fragment, 'fragment')

    return urlunsplit((scheme, netloc, path, query, fragment))