summaryrefslogtreecommitdiff
path: root/rfc3986/misc.py
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@baserock.org>2014-06-30 23:17:28 +0000
committer <>2014-10-24 11:03:41 +0000
commit9817ec3e47bca8fba9a7cac56d785e9d644f7473 (patch)
tree2eddc46197f66e1629dcdf01bf0cb543c729920c /rfc3986/misc.py
downloadpython-rfc3986-9817ec3e47bca8fba9a7cac56d785e9d644f7473.tar.gz
Imported from /home/lorry/working-area/delta_python-packages_python-rfc3986/rfc3986-0.2.0.tar.gz.HEADrfc3986-0.2.0master
Diffstat (limited to 'rfc3986/misc.py')
-rw-r--r--rfc3986/misc.py197
1 files changed, 197 insertions, 0 deletions
diff --git a/rfc3986/misc.py b/rfc3986/misc.py
new file mode 100644
index 0000000..d6205f6
--- /dev/null
+++ b/rfc3986/misc.py
@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Rackspace
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+rfc3986.misc
+~~~~~~~~~~~~
+
+This module contains important constants, patterns, and compiled regular
+expressions for parsing and validating URIs and their components.
+"""
+
+import re
+
+# These are enumerated for the named tuple used as a superclass of
+# URIReference
+URI_COMPONENTS = ['scheme', 'authority', 'path', 'query', 'fragment']
+
+important_characters = {
+ 'generic_delimiters': ":/?#[]@",
+ 'sub_delimiters': "!$&'()*+,;=",
+ # We need to escape the '*' in this case
+ 're_sub_delimiters': "!$&'()\*+,;=",
+ 'unreserved_chars': ('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
+ '012345789._~-'),
+ # We need to escape the '-' in this case:
+ 're_unreserved': 'A-Za-z0-9._~\-',
+ }
+# For details about delimiters and reserved characters, see:
+# http://tools.ietf.org/html/rfc3986#section-2.2
+GENERIC_DELIMITERS = set(important_characters['generic_delimiters'])
+SUB_DELIMITERS = set(important_characters['sub_delimiters'])
+RESERVED_CHARS = GENERIC_DELIMITERS.union(SUB_DELIMITERS)
+# For details about unreserved characters, see:
+# http://tools.ietf.org/html/rfc3986#section-2.3
+UNRESERVED_CHARS = set(important_characters['unreserved_chars'])
+NON_PCT_ENCODED = RESERVED_CHARS.union(UNRESERVED_CHARS).union('%')
+
+# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
+component_pattern_dict = {
+ 'scheme': '[^:/?#]+',
+ 'authority': '[^/?#]*',
+ 'path': '[^?#]*',
+ 'query': '[^#]*',
+ 'fragment': '.*',
+ }
+
+# See http://tools.ietf.org/html/rfc3986#appendix-B
+# In this case, we name each of the important matches so we can use
+# SRE_Match#groupdict to parse the values out if we so choose. This is also
+# modified to ignore other matches that are not important to the parsing of
+# the reference so we can also simply use SRE_Match#groups.
+expression = ('(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
+ '(?P<path>{path})(?:\?(?P<query>{query}))?'
+ '(?:#(?P<fragment>{fragment}))?'
+ ).format(**component_pattern_dict)
+
+URI_MATCHER = re.compile(expression)
+
+# #########################
+# Authority Matcher Section
+# #########################
+
+# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
+# The pattern for a regular name, e.g., www.google.com, api.github.com
+reg_name = '[\w\d.]+'
+# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
+ipv4 = '(\d{1,3}.){3}\d{1,3}'
+# Hexadecimal characters used in each piece of an IPv6 address
+hexdig = '[0-9A-Fa-f]{1,4}'
+# Least-significant 32 bits of an IPv6 address
+ls32 = '({hex}:{hex}|{ipv4})'.format(hex=hexdig, ipv4=ipv4)
+# Substitutions into the following patterns for IPv6 patterns defined
+# http://tools.ietf.org/html/rfc3986#page-20
+subs = {'hex': hexdig, 'ls32': ls32}
+
+# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
+# about ABNF (Augmented Backus-Naur Form) use in the comments
+variations = [
+ # 6( h16 ":" ) ls32
+ '(%(hex)s:){6}%(ls32)s' % subs,
+ # "::" 5( h16 ":" ) ls32
+ '::(%(hex)s:){5}%(ls32)s' % subs,
+ # [ h16 ] "::" 4( h16 ":" ) ls32
+ '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % subs,
+ # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+ '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % subs,
+ # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+ '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % subs,
+ # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
+ '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % subs,
+ # [ *4( h16 ":" ) h16 ] "::" ls32
+ '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % subs,
+ # [ *5( h16 ":" ) h16 ] "::" h16
+ '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % subs,
+ # [ *6( h16 ":" ) h16 ] "::"
+ '((%(hex)s:){0,6}%(hex)s)?::' % subs,
+ ]
+
+ipv6 = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7}))'.format(*variations)
+
+ipv_future = 'v[0-9A-Fa-f]+.[%s]+' % (
+ important_characters['re_unreserved'] +
+ important_characters['re_sub_delimiters'] +
+ ':')
+
+ip_literal = '\[({0}|{1})\]'.format(ipv6, ipv_future)
+
+# Pattern for matching the host piece of the authority
+HOST_PATTERN = '({0}|{1}|{2})'.format(reg_name, ipv4, ip_literal)
+
+SUBAUTHORITY_MATCHER = re.compile((
+ '^(?:(?P<userinfo>[A-Za-z0-9_.~\-%:]+)@)?' # userinfo
+ '(?P<host>{0}?)' # host
+ ':?(?P<port>\d+)?$' # port
+ ).format(HOST_PATTERN))
+
+
+# ####################
+# Path Matcher Section
+# ####################
+
+# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
+# about the path patterns defined below.
+
+# Percent encoded character values
+pct_encoded = '%[A-Fa-f0-9]{2}'
+pchar = ('([' + important_characters['re_unreserved']
+ + important_characters['re_sub_delimiters']
+ + ':@]|%s)' % pct_encoded)
+segments = {
+ 'segment': pchar + '*',
+ # Non-zero length segment
+ 'segment-nz': pchar + '+',
+ # Non-zero length segment without ":"
+ 'segment-nz-nc': pchar.replace(':', '') + '+'
+ }
+
+# Path types taken from Section 3.3 (linked above)
+path_empty = '^$'
+path_rootless = '%(segment-nz)s(/%(segment)s)*' % segments
+path_noscheme = '%(segment-nz-nc)s(/%(segment)s)*' % segments
+path_absolute = '/(%s)?' % path_rootless
+path_abempty = '(/%(segment)s)*' % segments
+
+# Matcher used to validate path components
+PATH_MATCHER = re.compile('^(%s|%s|%s|%s|%s)$' % (
+ path_abempty, path_absolute, path_noscheme, path_rootless, path_empty
+ ))
+
+
+# ##################################
+# Query and Fragment Matcher Section
+# ##################################
+
+QUERY_MATCHER = re.compile(
+ '^([/?:@' + important_characters['re_unreserved']
+ + important_characters['re_sub_delimiters']
+ + ']|%s)*$' % pct_encoded)
+
+FRAGMENT_MATCHER = QUERY_MATCHER
+
+# Scheme validation, see: http://tools.ietf.org/html/rfc3986#section-3.1
+SCHEME_MATCHER = re.compile('^[A-Za-z][A-Za-z0-9+.\-]*$')
+
+# Relative reference matcher
+
+# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
+relative_part = '(//%s%s|%s|%s|%s)' % (
+ component_pattern_dict['authority'], path_abempty, path_absolute,
+ path_noscheme, path_empty
+ )
+
+RELATIVE_REF_MATCHER = re.compile('^%s(\?%s)?(#%s)?$' % (
+ relative_part, QUERY_MATCHER.pattern, FRAGMENT_MATCHER.pattern
+ ))
+
+# See http://tools.ietf.org/html/rfc3986#section-3 for definition
+hier_part = '(//%s%s|%s|%s|%s)' % (
+ component_pattern_dict['authority'], path_abempty, path_absolute,
+ path_rootless, path_empty
+ )
+
+# See http://tools.ietf.org/html/rfc3986#section-4.3
+ABSOLUTE_URI_MATCHER = re.compile('^%s:%s(\?%s)$' % (
+ component_pattern_dict['scheme'], hier_part, QUERY_MATCHER.pattern
+ ))