summaryrefslogtreecommitdiff
path: root/bzrlib/urlutils.py
diff options
context:
space:
mode:
Diffstat (limited to 'bzrlib/urlutils.py')
-rw-r--r--bzrlib/urlutils.py969
1 files changed, 969 insertions, 0 deletions
diff --git a/bzrlib/urlutils.py b/bzrlib/urlutils.py
new file mode 100644
index 0000000..7f6ab1d
--- /dev/null
+++ b/bzrlib/urlutils.py
@@ -0,0 +1,969 @@
+# Copyright (C) 2006-2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""A collection of functions for handling URL operations."""
+
+from __future__ import absolute_import
+
+import os
+import re
+import sys
+
+from bzrlib.lazy_import import lazy_import
+lazy_import(globals(), """
+from posixpath import split as _posix_split
+import urlparse
+
+from bzrlib import (
+ errors,
+ osutils,
+ )
+""")
+
+
+def basename(url, exclude_trailing_slash=True):
+ """Return the last component of a URL.
+
+ :param url: The URL in question
+ :param exclude_trailing_slash: If the url looks like "path/to/foo/"
+ ignore the final slash and return 'foo' rather than ''
+ :return: Just the final component of the URL. This can return ''
+ if you don't exclude_trailing_slash, or if you are at the
+ root of the URL.
+ """
+ # split() returns a (parent, child) pair; the child is the last component.
+ return split(url, exclude_trailing_slash=exclude_trailing_slash)[1]
+
+
+def dirname(url, exclude_trailing_slash=True):
+ """Return the parent directory of the given path.
+
+ :param url: Relative or absolute URL
+ :param exclude_trailing_slash: Remove a final slash
+ (treat http://host/foo/ as http://host/foo, but
+ http://host/ stays http://host/)
+ :return: Everything in the URL except the last path chunk
+ """
+ # TODO: jam 20060502 This was named dirname to be consistent
+ # with the os functions, but maybe "parent" would be better
+ # split() returns a (parent, child) pair; the parent is element 0.
+ return split(url, exclude_trailing_slash=exclude_trailing_slash)[0]
+
+
+# Private copies of quote and unquote, copied from Python's
+# urllib module because urllib unconditionally imports socket, which imports
+# ssl.
+
+# Characters that quote() never percent-encodes, whatever `safe` is passed.
+always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ 'abcdefghijklmnopqrstuvwxyz'
+ '0123456789' '_.-')
+# Maps each single-byte character either to itself (ASCII and listed in
+# always_safe) or to its upper-case '%XX' escape.
+_safe_map = {}
+# NOTE: xrange and str(bytearray(...)) are used with their Python 2 meaning,
+# where the latter yields a 256-character byte string. The loop variables
+# i and c stay bound at module level once the loop has finished.
+for i, c in zip(xrange(256), str(bytearray(xrange(256)))):
+ _safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i)
+# Cache filled by quote(): (safe, always_safe) -> (quoter, full safe string).
+_safe_quoters = {}
+
+
+def quote(s, safe='/'):
+ """quote('abc def') -> 'abc%20def'
+
+ Each part of a URL, e.g. the path info, the query, etc., has a
+ different set of reserved characters that must be quoted.
+
+ RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
+ the following reserved characters.
+
+ reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
+ "$" | ","
+
+ Each of these characters is reserved in some component of a URL,
+ but not necessarily in all of them.
+
+ By default, the quote function is intended for quoting the path
+ section of a URL. Thus, it will not encode '/'. This character
+ is reserved, but in typical usage the quote function is being
+ called on a path where the existing slash characters are used as
+ reserved characters.
+ """
+ # fastpath
+ if not s:
+ if s is None:
+ raise TypeError('None object cannot be quoted')
+ return s
+ # One quoter is cached per distinct `safe` string.
+ cachekey = (safe, always_safe)
+ try:
+ # On a cache hit `safe` is rebound to the stored always_safe + safe.
+ (quoter, safe) = _safe_quoters[cachekey]
+ except KeyError:
+ # Start from the default table and let the extra safe characters
+ # map to themselves.
+ safe_map = _safe_map.copy()
+ safe_map.update([(c, c) for c in safe])
+ quoter = safe_map.__getitem__
+ safe = always_safe + safe
+ _safe_quoters[cachekey] = (quoter, safe)
+ # If stripping safe characters leaves nothing, s needs no quoting at all.
+ if not s.rstrip(safe):
+ return s
+ return ''.join(map(quoter, s))
+
+
+# Every two-character hex string (upper, lower or mixed case) mapped to the
+# character with that ordinal; used by unquote().
+_hexdig = '0123456789ABCDEFabcdef'
+_hextochr = dict((a + b, chr(int(a + b, 16)))
+ for a in _hexdig for b in _hexdig)
+
+def unquote(s):
+ """unquote('abc%20def') -> 'abc def'."""
+ res = s.split('%')
+ # fastpath
+ if len(res) == 1:
+ return s
+ # res[0] is the text before the first '%'; every later item starts with
+ # what should be the two hex digits of an escape.
+ s = res[0]
+ for item in res[1:]:
+ try:
+ s += _hextochr[item[:2]] + item[2:]
+ except KeyError:
+ # Not a valid two-digit hex escape: keep the '%' literally.
+ s += '%' + item
+ except UnicodeDecodeError:
+ # presumably reached when s is unicode and the decoded byte is
+ # non-ASCII (Python 2 implicit decoding) -- TODO confirm
+ s += unichr(int(item[:2], 16)) + item[2:]
+ return s
+
+
+def escape(relpath):
+ """Escape relpath to be a valid url."""
+ # Unicode input is turned into UTF-8 bytes first so that quote() works
+ # on single bytes.
+ if isinstance(relpath, unicode):
+ relpath = relpath.encode('utf-8')
+ # After quoting and encoding, the path should be perfectly
+ # safe as a plain ASCII string, str() just enforces this
+ # '~' is left unescaped in addition to '/'.
+ return str(quote(relpath, safe='/~'))
+
+
+def file_relpath(base, path):
+ """Compute just the relative sub-portion of a url
+
+ This assumes that both paths are already fully specified file:// URLs.
+ """
+ # MIN_ABS_FILEURL_LENGTH is platform dependent (longer on win32, where a
+ # drive letter is part of the shortest absolute file URL).
+ if len(base) < MIN_ABS_FILEURL_LENGTH:
+ raise ValueError('Length of base (%r) must equal or'
+ ' exceed the platform minimum url length (which is %d)' %
+ (base, MIN_ABS_FILEURL_LENGTH))
+ # Both URLs are converted to local paths, compared there, and the
+ # relative result is escaped back into URL form.
+ base = osutils.normpath(local_path_from_url(base))
+ path = osutils.normpath(local_path_from_url(path))
+ return escape(osutils.relpath(base, path))
+
+
+def _find_scheme_and_separator(url):
+ """Find the scheme separator (://) and the first path separator
+
+ This is just a helper function for other path utilities.
+ It could probably be replaced by urlparse
+
+ :return: (scheme_length, first_path_slash_index). Both are None when
+ url does not start with a scheme; the second is None when there is
+ a scheme but no '/' after it.
+ """
+ m = _url_scheme_re.match(url)
+ if not m:
+ return None, None
+
+ scheme = m.group('scheme')
+ path = m.group('path')
+
+ # Find the path separating slash
+ # (first slash after the ://)
+ first_path_slash = path.find('/')
+ if first_path_slash == -1:
+ return len(scheme), None
+ # Convert the offset inside the 'path' group to an index into url.
+ return len(scheme), first_path_slash+m.start('path')
+
+
+def is_url(url):
+ """Tests whether a URL is in actual fact a URL."""
+ # _url_scheme_re demands a scheme of at least two characters before the
+ # ':', so a one-letter prefix such as 'C:' does not count as a scheme.
+ return _url_scheme_re.match(url) is not None
+
+
+def join(base, *args):
+ """Create a URL by joining sections.
+
+ This will normalize '..', assuming that paths are absolute
+ (it assumes no symlinks in either path)
+
+ If any of *args is an absolute URL, it will be treated correctly.
+ Example:
+ join('http://foo', 'http://bar') => 'http://bar'
+ join('http://foo', 'bar') => 'http://foo/bar'
+ join('http://foo', 'bar', '../baz') => 'http://foo/baz'
+ """
+ if not args:
+ return base
+ scheme_end, path_start = _find_scheme_and_separator(base)
+ if scheme_end is None and path_start is None:
+ # No scheme at all: the whole of base is treated as a path.
+ path_start = 0
+ elif path_start is None:
+ # A scheme but no path slash: the path part is empty.
+ path_start = len(base)
+ path = base[path_start:]
+ for arg in args:
+ arg_scheme_end, arg_path_start = _find_scheme_and_separator(arg)
+ if arg_scheme_end is None and arg_path_start is None:
+ arg_path_start = 0
+ elif arg_path_start is None:
+ arg_path_start = len(arg)
+ if arg_scheme_end is not None:
+ # arg is itself a full URL, so it replaces everything so far.
+ base = arg
+ path = arg[arg_path_start:]
+ scheme_end = arg_scheme_end
+ path_start = arg_path_start
+ else:
+ path = joinpath(path, arg)
+ # Re-attach the scheme/host prefix of whichever URL is current.
+ return base[:path_start] + path
+
+
+def joinpath(base, *args):
+    """Join URL path segments to a URL path segment.
+
+    This is somewhat like osutils.joinpath, but intended for URLs.
+
+    XXX: this duplicates some normalisation logic, and also duplicates a lot of
+    path handling logic that already exists in some Transport implementations.
+    We really should try to have exactly one place in the code base responsible
+    for combining paths of URLs.
+
+    :param base: The starting URL path; a single trailing '/' is ignored.
+    :param args: Path segments applied in order. A segment starting with '/'
+        discards everything accumulated so far, '.' chunks are skipped and
+        '..' chunks remove the last accumulated chunk.
+    :return: The combined path, or '/' when only the root is left.
+    :raises errors.InvalidURLJoin: if a '..' chunk would climb above the
+        start of the path.
+    """
+    path = base.split('/')
+    if len(path) > 1 and path[-1] == '':
+        # If the path ends in a trailing /, remove it.
+        path.pop()
+    for arg in args:
+        if arg.startswith('/'):
+            path = []
+        for chunk in arg.split('/'):
+            if chunk == '.':
+                continue
+            elif chunk == '..':
+                # [''] is the root of an absolute path; [] means a relative
+                # path has already been consumed completely. Popping from an
+                # empty list would otherwise leak a bare IndexError.
+                if not path or path == ['']:
+                    raise errors.InvalidURLJoin('Cannot go above root',
+                                                base, args)
+                path.pop()
+            else:
+                path.append(chunk)
+    if path == ['']:
+        return '/'
+    else:
+        return '/'.join(path)
+
+
+# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url'
+def _posix_local_path_from_url(url):
+ """Convert a url like file:///path/to/foo into /path/to/foo"""
+ # Segment parameters (",key=value" suffixes) are not part of the path.
+ url = split_segment_parameters_raw(url)[0]
+ file_localhost_prefix = 'file://localhost/'
+ if url.startswith(file_localhost_prefix):
+ # The "- 1" keeps the '/' that ends the prefix as the path root.
+ path = url[len(file_localhost_prefix) - 1:]
+ elif not url.startswith('file:///'):
+ raise errors.InvalidURL(
+ url, 'local urls must start with file:/// or file://localhost/')
+ else:
+ path = url[len('file://'):]
+ # We only strip off 2 slashes
+ return unescape(path)
+
+
+def _posix_local_path_to_url(path):
+ """Convert a local path like ./foo into a URL like file:///path/to/foo
+
+ This also handles transforming escaping unicode characters, etc.
+ """
+ # importing directly from posixpath allows us to test this
+ # on non-posix platforms
+ # The absolute path is presumably expected to start with '/', which then
+ # supplies the third slash of 'file:///' -- TODO confirm.
+ return 'file://' + escape(osutils._posix_abspath(path))
+
+
+def _win32_local_path_from_url(url):
+    """Convert a url like file:///C:/path/to/foo into C:/path/to/foo
+
+    Three forms are accepted: 'file:///' alone (returned as '/'),
+    'file:///X:/...' or 'file:///X|/...' with a drive letter, and UNC urls
+    of the form 'file://HOST/path'.
+
+    :param url: A file:// URL, possibly carrying segment parameters.
+    :return: The unescaped local path.
+    :raises errors.InvalidURL: if url does not start with 'file://', has
+        an empty UNC host, or does not match one of the accepted forms.
+    """
+    if not url.startswith('file://'):
+        raise errors.InvalidURL(url, 'local urls must start with file:///, '
+                                     'UNC path urls must start with file://')
+    url = split_segment_parameters_raw(url)[0]
+    # Only the scheme is dropped here; the leading slashes are still needed
+    # to tell UNC urls (two slashes) from local ones (three slashes).
+    win32_url = url[len('file:'):]
+    # check for UNC path: //HOST/path
+    if not win32_url.startswith('///'):
+        # A length test and a slice replace bare indexing so that very short
+        # urls ('file://', 'file://h') no longer escape as IndexError.
+        if (len(win32_url) < 3
+                or win32_url[2] == '/'
+                or win32_url[3:4] in ('|', ':')):
+            raise errors.InvalidURL(url, 'Win32 UNC path urls'
+                                         ' have form file://HOST/path')
+        return unescape(win32_url)
+
+    # allow empty paths so we can serve all roots
+    if win32_url == '///':
+        return '/'
+
+    # usual local path with drive letter
+    if (len(win32_url) < 6
+            or win32_url[3] not in ('abcdefghijklmnopqrstuvwxyz'
+                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+            or win32_url[4] not in '|:'
+            or win32_url[5] != '/'):
+        raise errors.InvalidURL(url, 'Win32 file urls start with'
+                                     ' file:///x:/, where x is a valid drive letter')
+    # The drive letter is upper-cased and '|' is normalised to ':'.
+    return win32_url[3].upper() + u':' + unescape(win32_url[5:])
+
+
+def _win32_local_path_to_url(path):
+ """Convert a local path like ./foo into a URL like file:///C:/path/to/foo
+
+ This also handles transforming escaping unicode characters, etc.
+ """
+ # importing directly from ntpath allows us to test this
+ # on non-win32 platform
+ # FIXME: It turns out that on nt, ntpath.abspath uses nt._getfullpathname
+ # which actually strips trailing space characters.
+ # The worst part is that on linux ntpath.abspath has different
+ # semantics, since 'nt' is not an available module.
+ # '/' stands for "all roots" and maps to the bare file:/// URL.
+ if path == '/':
+ return 'file:///'
+
+ win32_path = osutils._win32_abspath(path)
+ # check for UNC path \\HOST\path
+ if win32_path.startswith('//'):
+ return 'file:' + escape(win32_path)
+ # assumes win32_path looks like 'X:/...' here, so [0] is the drive letter
+ # and [2:] the path after the colon -- TODO confirm against osutils
+ return ('file:///' + str(win32_path[0].upper()) + ':' +
+ escape(win32_path[2:]))
+
+
+# The POSIX implementations are the default; they are swapped for the win32
+# ones below when running on Windows.
+local_path_to_url = _posix_local_path_to_url
+local_path_from_url = _posix_local_path_from_url
+# Length of the shortest absolute file URL on each platform.
+MIN_ABS_FILEURL_LENGTH = len('file:///')
+WIN32_MIN_ABS_FILEURL_LENGTH = len('file:///C:/')
+
+if sys.platform == 'win32':
+ local_path_to_url = _win32_local_path_to_url
+ local_path_from_url = _win32_local_path_from_url
+
+ MIN_ABS_FILEURL_LENGTH = WIN32_MIN_ABS_FILEURL_LENGTH
+
+
+# Matches 'scheme:' (two or more characters other than ':' and '/'), an
+# optional '//', and captures everything after that as 'path'.
+_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,}):(//)?(?P<path>.*)$')
+# Matches one percent-escape: '%' followed by exactly two hex digits.
+_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})')
+
+
+def _unescape_safe_chars(matchobj):
+ """re.sub callback to convert hex-escapes to plain characters (if safe).
+
+ e.g. '%7E' will be converted to '~'.
+ """
+ # group(0) is the whole '%XX' escape; drop the '%' to get the hex digits.
+ hex_digits = matchobj.group(0)[1:]
+ char = chr(int(hex_digits, 16))
+ if char in _url_dont_escape_characters:
+ return char
+ else:
+ # Escapes that must stay are normalised to upper-case hex digits.
+ return matchobj.group(0).upper()
+
+
+def normalize_url(url):
+    """Make sure that a path string is in fully normalized URL form.
+
+    This handles URLs which have unicode characters, spaces,
+    special characters, etc.
+
+    It has two basic modes of operation, depending on whether the
+    supplied string starts with a url specifier (scheme://) or not.
+    If it does not have a specifier it is considered a local path,
+    and will be converted into a file:/// url. Non-ascii characters
+    will be encoded using utf-8.
+    If it does have a url specifier, it will be treated as a "hybrid"
+    URL. Basically, a URL that should have URL special characters already
+    escaped (like +?&# etc), but may have unicode characters, etc
+    which would not be valid in a real URL.
+
+    :param url: Either a hybrid URL or a local path
+    :return: A normalized URL which only includes 7-bit ASCII characters.
+    :raises errors.InvalidURL: if a byte-string URL contains a character
+        outside the set of URL-safe characters.
+    """
+    scheme_end, path_start = _find_scheme_and_separator(url)
+    if scheme_end is None:
+        return local_path_to_url(url)
+    if path_start is None:
+        # A scheme but no path slash (e.g. 'http://host'). Without this,
+        # url[:None] and url[None:] are both the whole url and the result
+        # would contain it twice. join() applies the same guard.
+        path_start = len(url)
+    prefix = url[:path_start]
+    path = url[path_start:]
+    if not isinstance(url, unicode):
+        for c in url:
+            if c not in _url_safe_characters:
+                raise errors.InvalidURL(url, 'URLs can only contain specific'
+                                             ' safe characters (not %r)' % c)
+        path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
+        return str(prefix + path)
+
+    # We have a unicode (hybrid) url
+    path_chars = list(path)
+
+    for i, char in enumerate(path_chars):
+        if char not in _url_safe_characters:
+            # Percent-encode every byte of the character's UTF-8 form.
+            path_chars[i] = ''.join(
+                ['%%%02X' % ord(c) for c in char.encode('utf-8')])
+    path = ''.join(path_chars)
+    # Escapes of characters that never need escaping are decoded again and
+    # the remaining escapes are upper-cased.
+    path = _url_hex_escapes_re.sub(_unescape_safe_chars, path)
+    return str(prefix + path)
+
+
+def relative_url(base, other):
+ """Return a path to other from base.
+
+ If other is unrelated to base, return other. Else return a relative path.
+ This assumes no symlinks as part of the url.
+ """
+ # Without a scheme and a path slash on both sides there is nothing to
+ # compare, so other is returned unchanged.
+ dummy, base_first_slash = _find_scheme_and_separator(base)
+ if base_first_slash is None:
+ return other
+
+ dummy, other_first_slash = _find_scheme_and_separator(other)
+ if other_first_slash is None:
+ return other
+
+ # this takes care of differing schemes or hosts
+ base_scheme = base[:base_first_slash]
+ other_scheme = other[:other_first_slash]
+ if base_scheme != other_scheme:
+ return other
+ elif sys.platform == 'win32' and base_scheme == 'file://':
+ # The two characters after the path slash are compared as the drive
+ # part; different drives cannot be expressed relatively.
+ base_drive = base[base_first_slash+1:base_first_slash+3]
+ other_drive = other[other_first_slash+1:other_first_slash+3]
+ if base_drive != other_drive:
+ return other
+
+ base_path = base[base_first_slash+1:]
+ other_path = other[other_first_slash+1:]
+
+ if base_path.endswith('/'):
+ base_path = base_path[:-1]
+
+ base_sections = base_path.split('/')
+ other_sections = other_path.split('/')
+
+ # ''.split('/') gives [''], which stands for "no sections" here.
+ if base_sections == ['']:
+ base_sections = []
+ if other_sections == ['']:
+ other_sections = []
+
+ # Collect the leading sections shared by both paths.
+ output_sections = []
+ for b, o in zip(base_sections, other_sections):
+ if b != o:
+ break
+ output_sections.append(b)
+
+ # One '..' per unshared base section, then the unshared rest of other.
+ match_len = len(output_sections)
+ output_sections = ['..' for x in base_sections[match_len:]]
+ output_sections.extend(other_sections[match_len:])
+
+ return "/".join(output_sections) or "."
+
+
+def _win32_extract_drive_letter(url_base, path):
+ """On win32 the drive letter needs to be added to the url base."""
+ # Strip off the drive letter
+ # path is currently /C:/foo
+ # path[2] must be the drive separator (':' or '|') and path[3] the slash
+ # that starts the rest of the path.
+ if len(path) < 4 or path[2] not in ':|' or path[3] != '/':
+ raise errors.InvalidURL(url_base + path,
+ 'win32 file:/// paths need a drive letter')
+ url_base += path[0:3] # file:// + /C:
+ path = path[3:] # /foo
+ return url_base, path
+
+
+def split(url, exclude_trailing_slash=True):
+ """Split a URL into its parent directory and a child directory.
+
+ :param url: A relative or absolute URL
+ :param exclude_trailing_slash: Strip off a final '/' if it is part
+ of the path (but not if it is part of the protocol specification)
+
+ :return: (parent_url, child_dir). child_dir may be the empty string if we're at
+ the root.
+ """
+ scheme_loc, first_path_slash = _find_scheme_and_separator(url)
+
+ if first_path_slash is None:
+ # We have either a relative path, or no separating slash
+ if scheme_loc is None:
+ # Relative path
+ if exclude_trailing_slash and url.endswith('/'):
+ url = url[:-1]
+ return _posix_split(url)
+ else:
+ # Scheme with no path
+ return url, ''
+
+ # We have a fully defined path
+ url_base = url[:first_path_slash] # http://host, file://
+ path = url[first_path_slash:] # /file/foo
+
+ if sys.platform == 'win32' and url.startswith('file:///'):
+ # Strip off the drive letter
+ # url_base is currently file://
+ # path is currently /C:/foo
+ url_base, path = _win32_extract_drive_letter(url_base, path)
+ # now it should be file:///C: and /foo
+
+ # len(path) > 1 keeps a bare '/' (the root) from being stripped to ''.
+ if exclude_trailing_slash and len(path) > 1 and path.endswith('/'):
+ path = path[:-1]
+ head, tail = _posix_split(path)
+ return url_base + head, tail
+
+
+def split_segment_parameters_raw(url):
+ """Split the subsegment of the last segment of a URL.
+
+ :param url: A relative or absolute URL
+ :return: (url, subsegments)
+ """
+ # GZ 2011-11-18: Dodgy removing the terminal slash like this, function
+ # operates on urls not url+segments, and Transport classes
+ # should not be blindly adding slashes in the first place.
+ lurl = strip_trailing_slash(url)
+ # Segments begin at first comma after last forward slash, if one exists
+ segment_start = lurl.find(",", lurl.rfind("/")+1)
+ if segment_start == -1:
+ # No parameters: the url is returned as given, trailing slash included.
+ return (url, [])
+ return (lurl[:segment_start], lurl[segment_start+1:].split(","))
+
+
+def split_segment_parameters(url):
+ """Split the segment parameters of the last segment of a URL.
+
+ :param url: A relative or absolute URL
+ :return: (url, segment_parameters)
+ """
+ (base_url, subsegments) = split_segment_parameters_raw(url)
+ parameters = {}
+ for subsegment in subsegments:
+ # NOTE(review): a subsegment without '=' makes this unpacking raise
+ # ValueError -- confirm whether callers expect that.
+ (key, value) = subsegment.split("=", 1)
+ # A key that occurs more than once keeps its last value.
+ parameters[key] = value
+ return (base_url, parameters)
+
+
+def join_segment_parameters_raw(base, *subsegments):
+ """Create a new URL by adding subsegments to an existing one.
+
+ This adds the specified subsegments to the last path in the specified
+ base URL. The subsegments should be bytestrings.
+
+ :note: You probably want to use join_segment_parameters instead.
+ """
+ if not subsegments:
+ return base
+ for subsegment in subsegments:
+ # An exact type test: subclasses of str and unicode are rejected.
+ if type(subsegment) is not str:
+ raise TypeError("Subsegment %r is not a bytestring" % subsegment)
+ # ',' is the subsegment separator, so it cannot appear inside one.
+ if "," in subsegment:
+ raise errors.InvalidURLJoin(", exists in subsegments",
+ base, subsegments)
+ return ",".join((base,) + subsegments)
+
+
+def join_segment_parameters(url, parameters):
+    """Create a new URL by adding segment parameters to an existing one.
+
+    The parameters of the last segment in the URL will be updated; if a
+    parameter with the same key already exists it will be overwritten.
+
+    :param url: A URL, as string
+    :param parameters: Dictionary of parameters, keys and values as bytestrings
+    :return: The URL with all parameters appended as ',key=value', sorted
+        by key.
+    :raises TypeError: if a key or value is not a bytestring.
+    :raises errors.InvalidURLJoin: if a key contains '='.
+    """
+    (base, existing_parameters) = split_segment_parameters(url)
+    new_parameters = {}
+    new_parameters.update(existing_parameters)
+    for key, value in parameters.iteritems():
+        if type(key) is not str:
+            raise TypeError("parameter key %r is not a bytestring" % key)
+        if type(value) is not str:
+            # The arguments follow the placeholders: the offending value
+            # first, then the key it belongs to.
+            raise TypeError("parameter value %r for %s is not a bytestring" %
+                            (value, key))
+        if "=" in key:
+            raise errors.InvalidURLJoin("= exists in parameter key", url,
+                                        parameters)
+        new_parameters[key] = value
+    # Sorting makes the resulting URL independent of dictionary order.
+    return join_segment_parameters_raw(base,
+        *["%s=%s" % item for item in sorted(new_parameters.items())])
+
+
+def _win32_strip_local_trailing_slash(url):
+ """Strip slashes after the drive letter"""
+ # Only the final character is removed, and only when the url is longer
+ # than the shortest absolute win32 file URL ('file:///C:/').
+ if len(url) > WIN32_MIN_ABS_FILEURL_LENGTH:
+ return url[:-1]
+ else:
+ return url
+
+
+def strip_trailing_slash(url):
+ """Strip trailing slash, except for root paths.
+
+ The definition of 'root path' is platform-dependent.
+ This assumes that all URLs are valid netloc urls, such that they
+ form:
+ scheme://host/path
+ It searches for ://, and then refuses to remove the next '/'.
+ It can also handle relative paths
+ Examples:
+ path/to/foo => path/to/foo
+ path/to/foo/ => path/to/foo
+ http://host/path/ => http://host/path
+ http://host/path => http://host/path
+ http://host/ => http://host/
+ file:/// => file:///
+ file:///foo/ => file:///foo
+ # This is unique on win32 platforms, and is the only URL
+ # format which does it differently.
+ file:///c|/ => file:///c|/
+ """
+ if not url.endswith('/'):
+ # Nothing to do
+ return url
+ # On win32 every file:// url is handled by the length-based helper.
+ if sys.platform == 'win32' and url.startswith('file://'):
+ return _win32_strip_local_trailing_slash(url)
+
+ scheme_loc, first_path_slash = _find_scheme_and_separator(url)
+ if scheme_loc is None:
+ # This is a relative path, as it has no scheme
+ # so just chop off the last character
+ return url[:-1]
+
+ if first_path_slash is None or first_path_slash == len(url)-1:
+ # Don't chop off anything if the only slash is the path
+ # separating slash
+ return url
+
+ return url[:-1]
+
+
+def unescape(url):
+ """Unescape relpath from url format.
+
+ This returns a Unicode path from a URL
+ """
+ # jam 20060427 URLs are supposed to be ASCII only strings
+ # If they are passed in as unicode, unquote
+ # will return a UNICODE string, which actually contains
+ # utf-8 bytes. So we have to ensure that they are
+ # plain ASCII strings, or the final .decode will
+ # try to encode the UNICODE => ASCII, and then decode
+ # it into utf-8.
+ try:
+ url = str(url)
+ except UnicodeError, e:
+ raise errors.InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,))
+
+ unquoted = unquote(url)
+ try:
+ unicode_path = unquoted.decode('utf-8')
+ except UnicodeError, e:
+ # NOTE(review): the message says "encode" although the failing step
+ # is a decode -- left untouched because it is runtime text.
+ raise errors.InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,))
+ return unicode_path
+
+
+# These are characters that if escaped, should stay that way
+_no_decode_chars = ';/?:@&=+$,#'
+_no_decode_ords = [ord(c) for c in _no_decode_chars]
+# Both the lower-case and the upper-case hex spelling of each such character.
+_no_decode_hex = (['%02x' % o for o in _no_decode_ords]
+ + ['%02X' % o for o in _no_decode_ords])
+# Two-digit hex string (either case) -> character, for all 256 byte values.
+_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)]
+ + [('%02X' % o, chr(o)) for o in range(256)]))
+#These entries get mapped to themselves
+_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex)
+
+# These characters shouldn't be percent-encoded, and it's always safe to
+# unencode them if they are.
+_url_dont_escape_characters = set(
+ "abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
+ "0123456789" # Numbers
+ "-._~" # Unreserved characters
+)
+
+# These characters should not be escaped
+_url_safe_characters = set(
+ "abcdefghijklmnopqrstuvwxyz" # Lowercase alpha
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha
+ "0123456789" # Numbers
+ "_.-!~*'()" # Unreserved characters
+ "/;?:@&=+$," # Reserved characters
+ "%#" # Extra reserved characters
+)
+
+def unescape_for_display(url, encoding):
+ """Decode what you can for a URL, so that we get a nice looking path.
+
+ This will turn file:// urls into local paths, and try to decode
+ any portions of a http:// style url that it can.
+
+ Any sections of the URL which can't be represented in the encoding or
+ need to stay as escapes are left alone.
+
+ :param url: A 7-bit ASCII URL
+ :param encoding: The final output encoding
+
+ :return: A unicode string which can be safely encoded into the
+ specified encoding.
+ """
+ if encoding is None:
+ raise ValueError('you cannot specify None for the display encoding')
+ if url.startswith('file://'):
+ try:
+ path = local_path_from_url(url)
+ # The encode result is discarded; it only checks that the path
+ # can be represented in the display encoding.
+ path.encode(encoding)
+ return path
+ except UnicodeError:
+ return url
+
+ # Split into sections to try to decode utf-8
+ res = url.split('/')
+ # Index 0 (the text before the first '/') is never decoded.
+ for i in xrange(1, len(res)):
+ escaped_chunks = res[i].split('%')
+ for j in xrange(1, len(escaped_chunks)):
+ item = escaped_chunks[j]
+ try:
+ # Characters listed in _no_decode_chars map back to their
+ # own escape, so they stay percent-encoded.
+ escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:]
+ except KeyError:
+ # Put back the percent symbol
+ escaped_chunks[j] = '%' + item
+ except UnicodeDecodeError:
+ escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:]
+ unescaped = ''.join(escaped_chunks)
+ try:
+ decoded = unescaped.decode('utf-8')
+ except UnicodeDecodeError:
+ # If this path segment cannot be properly utf-8 decoded
+ # after doing unescaping we will just leave it alone
+ pass
+ else:
+ try:
+ decoded.encode(encoding)
+ except UnicodeEncodeError:
+ # If this chunk cannot be encoded in the local
+ # encoding, then we should leave it alone
+ pass
+ else:
+ # Otherwise take the url decoded one
+ res[i] = decoded
+ return u'/'.join(res)
+
+
+def derive_to_location(from_location):
+ """Derive a TO_LOCATION given a FROM_LOCATION.
+
+ The normal case is a FROM_LOCATION of http://foo/bar => bar.
+ The Right Thing for some logical destinations may differ though
+ because no / may be present at all. In that case, the result is
+ the full name without the scheme indicator, e.g. lp:foo-bar => foo-bar.
+ This latter case also applies when a Windows drive
+ is used without a path, e.g. c:foo-bar => foo-bar.
+ If no /, path separator or : is found, the from_location is returned.
+ """
+ if from_location.find("/") >= 0 or from_location.find(os.sep) >= 0:
+ # Trailing '/' and '\' are removed so basename() is not empty.
+ return os.path.basename(from_location.rstrip("/\\"))
+ else:
+ sep = from_location.find(":")
+ # "> 0" means a ':' in the very first position is not a separator.
+ if sep > 0:
+ return from_location[sep+1:]
+ else:
+ return from_location
+
+
+def _is_absolute(url):
+ # presumably pathjoin() drops the '/foo' prefix when url is itself
+ # absolute, so an unchanged result marks an absolute path -- TODO confirm
+ return (osutils.pathjoin('/foo', url) == url)
+
+
+def rebase_url(url, old_base, new_base):
+ """Convert a relative path from an old base URL to a new base URL.
+
+ The result will be a relative path.
+ Absolute paths and full URLs are returned unaltered.
+ """
+ scheme, separator = _find_scheme_and_separator(url)
+ if scheme is not None:
+ return url
+ if _is_absolute(url):
+ return url
+ old_parsed = urlparse.urlparse(old_base)
+ new_parsed = urlparse.urlparse(new_base)
+ # Elements 0 and 1 of a parse result are the scheme and the netloc; both
+ # bases must agree on them for a relative path to exist.
+ if (old_parsed[:2]) != (new_parsed[:2]):
+ raise errors.InvalidRebaseURLs(old_base, new_base)
+ # Element 2 is the path: resolve url against the old path, then express
+ # the result relative to the new path.
+ return determine_relative_path(new_parsed[2],
+ join(old_parsed[2], url))
+
+
+def determine_relative_path(from_path, to_path):
+ """Determine a relative path from from_path to to_path."""
+ from_segments = osutils.splitpath(from_path)
+ to_segments = osutils.splitpath(to_path)
+ # count ends up as the number of leading segments the two paths share.
+ # Starting at -1 makes the "+= 1" below give 0 when zip() is empty.
+ count = -1
+ for count, (from_element, to_element) in enumerate(zip(from_segments,
+ to_segments)):
+ if from_element != to_element:
+ break
+ # NOTE(review): indentation is lost in this rendering; this else is
+ # presumably the for-loop's else (no mismatch found) -- confirm.
+ else:
+ count += 1
+ unique_from = from_segments[count:]
+ unique_to = to_segments[count:]
+ # One '..' per unshared from-segment, followed by the unshared to-segments.
+ segments = (['..'] * len(unique_from) + unique_to)
+ if len(segments) == 0:
+ return '.'
+ return osutils.pathjoin(*segments)
+
+
+class URL(object):
+ """Parsed URL."""
+
+ def __init__(self, scheme, quoted_user, quoted_password, quoted_host,
+ port, quoted_path):
+ # Each quoted component is stored next to its unquoted form; user and
+ # password may be None, in which case the unquoted form is None too.
+ self.scheme = scheme
+ self.quoted_host = quoted_host
+ self.host = unquote(self.quoted_host)
+ self.quoted_user = quoted_user
+ if self.quoted_user is not None:
+ self.user = unquote(self.quoted_user)
+ else:
+ self.user = None
+ self.quoted_password = quoted_password
+ if self.quoted_password is not None:
+ self.password = unquote(self.quoted_password)
+ else:
+ self.password = None
+ self.port = port
+ # Escapes of characters that never need escaping are decoded and the
+ # remaining escapes upper-cased before the path is stored.
+ self.quoted_path = _url_hex_escapes_re.sub(_unescape_safe_chars, quoted_path)
+ self.path = unquote(self.quoted_path)
+
+ def __eq__(self, other):
+ # NOTE(review): port is not part of the comparison and no __ne__ is
+ # defined next to this method -- confirm both are intended.
+ return (isinstance(other, self.__class__) and
+ self.scheme == other.scheme and
+ self.host == other.host and
+ self.user == other.user and
+ self.password == other.password and
+ self.path == other.path)
+
+ def __repr__(self):
+ return "<%s(%r, %r, %r, %r, %r, %r)>" % (
+ self.__class__.__name__,
+ self.scheme, self.quoted_user, self.quoted_password,
+ self.quoted_host, self.port, self.quoted_path)
+
+ @classmethod
+ def from_string(cls, url):
+ """Create a URL object from a string.
+
+ :param url: URL as bytestring
+ """
+ if isinstance(url, unicode):
+ raise errors.InvalidURL('should be ascii:\n%r' % url)
+ # presumably this rejects byte strings with non-ASCII bytes through
+ # Python 2's implicit ASCII decode -- TODO confirm
+ url = url.encode('utf-8')
+ (scheme, netloc, path, params,
+ query, fragment) = urlparse.urlparse(url, allow_fragments=False)
+ user = password = host = port = None
+ if '@' in netloc:
+ # Split at the last '@' so the credentials may contain '@'.
+ user, host = netloc.rsplit('@', 1)
+ if ':' in user:
+ user, password = user.split(':', 1)
+ else:
+ host = netloc
+
+ # A ':' inside a fully bracketed host belongs to an IPv6 address and
+ # is not a port separator.
+ if ':' in host and not (host[0] == '[' and host[-1] == ']'):
+ # there *is* port
+ host, port = host.rsplit(':',1)
+ try:
+ port = int(port)
+ except ValueError:
+ raise errors.InvalidURL('invalid port number %s in url:\n%s' %
+ (port, url))
+ if host != "" and host[0] == '[' and host[-1] == ']': #IPv6
+ host = host[1:-1]
+
+ # params, query and fragment are parsed but not passed on.
+ return cls(scheme, user, password, host, port, path)
+
+ def __str__(self):
+ netloc = self.quoted_host
+ # A host containing ':' is put back in brackets (see from_string).
+ if ":" in netloc:
+ netloc = "[%s]" % netloc
+ if self.quoted_user is not None:
+ # Note that we don't put the password back even if we
+ # have one so that it doesn't get accidentally
+ # exposed.
+ netloc = '%s@%s' % (self.quoted_user, netloc)
+ if self.port is not None:
+ netloc = '%s:%d' % (netloc, self.port)
+ return urlparse.urlunparse(
+ (self.scheme, netloc, self.quoted_path, None, None, None))
+
+ @staticmethod
+ def _combine_paths(base_path, relpath):
+ """Transform a Transport-relative path to a remote absolute path.
+
+ This does not handle substitution of ~ but does handle '..' and '.'
+ components.
+
+ Examples::
+
+ t._combine_paths('/home/sarah', 'project/foo')
+ => '/home/sarah/project/foo'
+ t._combine_paths('/home/sarah', '../../etc')
+ => '/etc'
+ t._combine_paths('/home/sarah', '/etc')
+ => '/etc'
+
+ :param base_path: base path
+ :param relpath: relative url string for relative part of remote path.
+ :return: urlencoded string for final path.
+ """
+ if not isinstance(relpath, str):
+ raise errors.InvalidURL(relpath)
+ relpath = _url_hex_escapes_re.sub(_unescape_safe_chars, relpath)
+ if relpath.startswith('/'):
+ # An absolute relpath ignores base_path completely.
+ base_parts = []
+ else:
+ base_parts = base_path.split('/')
+ # Drop the empty last part left by a trailing '/' (or an empty base).
+ if len(base_parts) > 0 and base_parts[-1] == '':
+ base_parts = base_parts[:-1]
+ for p in relpath.split('/'):
+ if p == '..':
+ if len(base_parts) == 0:
+ # In most filesystems, a request for the parent
+ # of root, just returns root.
+ continue
+ base_parts.pop()
+ elif p == '.':
+ continue # No-op
+ elif p != '':
+ base_parts.append(p)
+ path = '/'.join(base_parts)
+ # The result is always absolute.
+ if not path.startswith('/'):
+ path = '/' + path
+ return path
+
+ def clone(self, offset=None):
+ """Return a new URL for a path relative to this URL.
+
+ :param offset: A relative path, already urlencoded
+ :return: `URL` instance
+ """
+ if offset is not None:
+ # The offset is unescaped to a UTF-8 byte string, combined with
+ # the unquoted path, and the result is quoted again.
+ relative = unescape(offset).encode('utf-8')
+ path = self._combine_paths(self.path, relative)
+ path = quote(path, safe="/~")
+ else:
+ path = self.quoted_path
+ return self.__class__(self.scheme, self.quoted_user,
+ self.quoted_password, self.quoted_host, self.port,
+ path)
+
+
+def parse_url(url):
+ """Extract the server address, the credentials and the path from the url.
+
+ user, password, host and path should be quoted if they contain reserved
+ chars.
+
+ :param url: an quoted url
+ :return: (scheme, user, password, host, port, path) tuple, all fields
+ are unquoted.
+ """
+ # All parsing is delegated to URL.from_string(); only the unquoted
+ # attributes are handed back.
+ parsed_url = URL.from_string(url)
+ return (parsed_url.scheme, parsed_url.user, parsed_url.password,
+ parsed_url.host, parsed_url.port, parsed_url.path)