# -*- coding: utf-8 -*-
# Provenance: rfc3986/misc.py as imported from rfc3986-0.2.0.tar.gz
# (commit 9817ec3e47bca8fba9a7cac56d785e9d644f7473, 2014-06-30).
#
# Copyright (c) 2014 Rackspace
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
rfc3986.misc
~~~~~~~~~~~~

This module contains important constants, patterns, and compiled regular
expressions for parsing and validating URIs and their components.

Lower-case names are pattern *strings* (building blocks); upper-case
``*_MATCHER`` names are compiled regular expressions.
"""

import re

# These are enumerated for the named tuple used as a superclass of
# URIReference
URI_COMPONENTS = ['scheme', 'authority', 'path', 'query', 'fragment']

# NOTE: every value that is spliced into a regular expression is a raw
# string so that regex escapes are never interpreted as string escapes.
important_characters = {
    'generic_delimiters': ":/?#[]@",
    'sub_delimiters': "!$&'()*+,;=",
    # We need to escape the '*' in this case
    're_sub_delimiters': r"!$&'()\*+,;=",
    # All ten digits must be listed here ('6' included).
    'unreserved_chars': ('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
                         '0123456789._~-'),
    # We need to escape the '-' in this case:
    're_unreserved': r'A-Za-z0-9._~\-',
}
# For details about delimiters and reserved characters, see:
# http://tools.ietf.org/html/rfc3986#section-2.2
GENERIC_DELIMITERS = set(important_characters['generic_delimiters'])
SUB_DELIMITERS = set(important_characters['sub_delimiters'])
RESERVED_CHARS = GENERIC_DELIMITERS.union(SUB_DELIMITERS)
# For details about unreserved characters, see:
# http://tools.ietf.org/html/rfc3986#section-2.3
UNRESERVED_CHARS = set(important_characters['unreserved_chars'])
NON_PCT_ENCODED = RESERVED_CHARS.union(UNRESERVED_CHARS).union('%')

# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
component_pattern_dict = {
    'scheme': '[^:/?#]+',
    'authority': '[^/?#]*',
    'path': '[^?#]*',
    'query': '[^#]*',
    'fragment': '.*',
}

# See http://tools.ietf.org/html/rfc3986#appendix-B
# In this case, we name each of the important matches so we can use
# SRE_Match#groupdict to parse the values out if we so choose. This is also
# modified to ignore other matches that are not important to the parsing of
# the reference so we can also simply use SRE_Match#groups.
expression = (r'(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
              r'(?P<path>{path})(?:\?(?P<query>{query}))?'
              r'(?:#(?P<fragment>{fragment}))?'
              ).format(**component_pattern_dict)

URI_MATCHER = re.compile(expression)

# #########################
# Authority Matcher Section
# #########################

# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
# The pattern for a regular name, e.g., www.google.com, api.github.com
# '-' is included so that hyphenated host names are accepted.
reg_name = r'[\w\d.\-]+'
# The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
# The dot is escaped so that only a literal '.' separates the octets.
ipv4 = r'(\d{1,3}\.){3}\d{1,3}'
# Hexadecimal characters used in each piece of an IPv6 address
hexdig = '[0-9A-Fa-f]{1,4}'
# Least-significant 32 bits of an IPv6 address
ls32 = '({hex}:{hex}|{ipv4})'.format(hex=hexdig, ipv4=ipv4)
# Substitutions into the following patterns for IPv6 patterns defined
# http://tools.ietf.org/html/rfc3986#page-20
subs = {'hex': hexdig, 'ls32': ls32}

# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
# about ABNF (Augmented Backus-Naur Form) use in the comments
variations = [
    # 6( h16 ":" ) ls32
    '(%(hex)s:){6}%(ls32)s' % subs,
    # "::" 5( h16 ":" ) ls32
    '::(%(hex)s:){5}%(ls32)s' % subs,
    # [ h16 ] "::" 4( h16 ":" ) ls32
    '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % subs,
    # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
    '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % subs,
    # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
    '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % subs,
    # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
    '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % subs,
    # [ *4( h16 ":" ) h16 ] "::" ls32
    '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % subs,
    # [ *5( h16 ":" ) h16 ] "::" h16
    '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % subs,
    # [ *6( h16 ":" ) h16 ] "::"
    '((%(hex)s:){0,6}%(hex)s)?::' % subs,
]

# Join *every* variation; a fixed-width format template would silently drop
# any entry beyond the placeholders it happens to name.
ipv6 = '({0})'.format('|'.join('({0})'.format(v) for v in variations))

# "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ); the dot is literal.
ipv_future = r'v[0-9A-Fa-f]+\.[%s]+' % (
    important_characters['re_unreserved'] +
    important_characters['re_sub_delimiters'] +
    ':')

ip_literal = r'\[({0}|{1})\]'.format(ipv6, ipv_future)

# Pattern for matching the host piece of the authority
HOST_PATTERN = '({0}|{1}|{2})'.format(reg_name, ipv4, ip_literal)

SUBAUTHORITY_MATCHER = re.compile((
    r'^(?:(?P<userinfo>[A-Za-z0-9_.~\-%:]+)@)?'  # userinfo
    r'(?P<host>{0}?)'  # host
    r':?(?P<port>\d+)?$'  # port
    ).format(HOST_PATTERN))


# ####################
# Path Matcher Section
# ####################

# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
# about the path patterns defined below.

# Percent encoded character values
pct_encoded = '%[A-Fa-f0-9]{2}'
pchar = ('([' + important_characters['re_unreserved']
         + important_characters['re_sub_delimiters']
         + ':@]|%s)' % pct_encoded)
segments = {
    'segment': pchar + '*',
    # Non-zero length segment
    'segment-nz': pchar + '+',
    # Non-zero length segment without ":"
    'segment-nz-nc': pchar.replace(':', '') + '+',
}

# Path types taken from Section 3.3 (linked above)
# path_empty is anchored, so it is only meaningful as a whole-string pattern.
path_empty = '^$'
path_rootless = '%(segment-nz)s(/%(segment)s)*' % segments
path_noscheme = '%(segment-nz-nc)s(/%(segment)s)*' % segments
path_absolute = '/(%s)?' % path_rootless
path_abempty = '(/%(segment)s)*' % segments

# Matcher used to validate path components
PATH_MATCHER = re.compile('^(%s|%s|%s|%s|%s)$' % (
    path_abempty, path_absolute, path_noscheme, path_rootless, path_empty
))

# Un-anchored spelling of "path-empty" for use *inside* larger patterns,
# where the anchored '^$' could never match once anything precedes or
# follows the (empty) path, e.g. in "#fragment" or "?query".
_inline_path_empty = ''


# ##################################
# Query and Fragment Matcher Section
# ##################################

# Un-anchored query/fragment body, reusable inside larger patterns.
_query_body = ('([/?:@' + important_characters['re_unreserved']
               + important_characters['re_sub_delimiters']
               + ']|%s)*' % pct_encoded)

QUERY_MATCHER = re.compile('^%s$' % _query_body)

FRAGMENT_MATCHER = QUERY_MATCHER

# Scheme validation, see: http://tools.ietf.org/html/rfc3986#section-3.1
SCHEME_MATCHER = re.compile(r'^[A-Za-z][A-Za-z0-9+.\-]*$')

# Relative reference matcher

# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
relative_part = '(//%s%s|%s|%s|%s)' % (
    component_pattern_dict['authority'], path_abempty, path_absolute,
    path_noscheme, _inline_path_empty
)

# The un-anchored query body is used here: embedding QUERY_MATCHER.pattern
# would place '^'/'$' in the middle of the expression and never match.
RELATIVE_REF_MATCHER = re.compile(r'^%s(\?%s)?(#%s)?$' % (
    relative_part, _query_body, _query_body
))

# See http://tools.ietf.org/html/rfc3986#section-3 for definition
hier_part = '(//%s%s|%s|%s|%s)' % (
    component_pattern_dict['authority'], path_abempty, path_absolute,
    path_rootless, _inline_path_empty
)

# See http://tools.ietf.org/html/rfc3986#section-4.3
# absolute-URI = scheme ":" hier-part [ "?" query ] -- the query is optional
# and no fragment is permitted.
ABSOLUTE_URI_MATCHER = re.compile(r'^%s:%s(\?%s)?$' % (
    component_pattern_dict['scheme'], hier_part, _query_body
))