diff options
Diffstat (limited to 'bzrlib/urlutils.py')
-rw-r--r-- | bzrlib/urlutils.py | 969 |
1 files changed, 969 insertions, 0 deletions
diff --git a/bzrlib/urlutils.py b/bzrlib/urlutils.py new file mode 100644 index 0000000..7f6ab1d --- /dev/null +++ b/bzrlib/urlutils.py @@ -0,0 +1,969 @@ +# Copyright (C) 2006-2010 Canonical Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +"""A collection of function for handling URL operations.""" + +from __future__ import absolute_import + +import os +import re +import sys + +from bzrlib.lazy_import import lazy_import +lazy_import(globals(), """ +from posixpath import split as _posix_split +import urlparse + +from bzrlib import ( + errors, + osutils, + ) +""") + + +def basename(url, exclude_trailing_slash=True): + """Return the last component of a URL. + + :param url: The URL in question + :param exclude_trailing_slash: If the url looks like "path/to/foo/" + ignore the final slash and return 'foo' rather than '' + :return: Just the final component of the URL. This can return '' + if you don't exclude_trailing_slash, or if you are at the + root of the URL. + """ + return split(url, exclude_trailing_slash=exclude_trailing_slash)[1] + + +def dirname(url, exclude_trailing_slash=True): + """Return the parent directory of the given path. + + :param url: Relative or absolute URL + :param exclude_trailing_slash: Remove a final slash + (treat http://host/foo/ as http://host/foo, but + http://host/ stays http://host/) + :return: Everything in the URL except the last path chunk + """ + # TODO: jam 20060502 This was named dirname to be consistent + # with the os functions, but maybe "parent" would be better + return split(url, exclude_trailing_slash=exclude_trailing_slash)[0] + + +# Private copies of quote and unquote, copied from Python's +# urllib module because urllib unconditionally imports socket, which imports +# ssl. + +always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '_.-') +_safe_map = {} +for i, c in zip(xrange(256), str(bytearray(xrange(256)))): + _safe_map[c] = c if (i < 128 and c in always_safe) else '%{0:02X}'.format(i) +_safe_quoters = {} + + +def quote(s, safe='/'): + """quote('abc def') -> 'abc%20def' + + Each part of a URL, e.g. the path info, the query, etc., has a + different set of reserved characters that must be quoted. + + RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + the following reserved characters. + + reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | + "$" | "," + + Each of these characters is reserved in some component of a URL, + but not necessarily in all of them. + + By default, the quote function is intended for quoting the path + section of a URL. Thus, it will not encode '/'. This character + is reserved, but in typical usage the quote function is being + called on a path where the existing slash characters are used as + reserved characters. + """ + # fastpath + if not s: + if s is None: + raise TypeError('None object cannot be quoted') + return s + cachekey = (safe, always_safe) + try: + (quoter, safe) = _safe_quoters[cachekey] + except KeyError: + safe_map = _safe_map.copy() + safe_map.update([(c, c) for c in safe]) + quoter = safe_map.__getitem__ + safe = always_safe + safe + _safe_quoters[cachekey] = (quoter, safe) + if not s.rstrip(safe): + return s + return ''.join(map(quoter, s)) + + +_hexdig = '0123456789ABCDEFabcdef' +_hextochr = dict((a + b, chr(int(a + b, 16))) + for a in _hexdig for b in _hexdig) + +def unquote(s): + """unquote('abc%20def') -> 'abc def'.""" + res = s.split('%') + # fastpath + if len(res) == 1: + return s + s = res[0] + for item in res[1:]: + try: + s += _hextochr[item[:2]] + item[2:] + except KeyError: + s += '%' + item + except UnicodeDecodeError: + s += unichr(int(item[:2], 16)) + item[2:] + return s + + +def escape(relpath): + """Escape relpath to be a valid url.""" + if isinstance(relpath, unicode): + relpath = relpath.encode('utf-8') + # After quoting and encoding, the path should be perfectly + # safe as a plain ASCII string, str() just enforces this + return str(quote(relpath, safe='/~')) + + +def file_relpath(base, path): + """Compute just the relative sub-portion of a url + + This assumes that both paths are already fully specified file:// URLs. + """ + if len(base) < MIN_ABS_FILEURL_LENGTH: + raise ValueError('Length of base (%r) must equal or' + ' exceed the platform minimum url length (which is %d)' % + (base, MIN_ABS_FILEURL_LENGTH)) + base = osutils.normpath(local_path_from_url(base)) + path = osutils.normpath(local_path_from_url(path)) + return escape(osutils.relpath(base, path)) + + +def _find_scheme_and_separator(url): + """Find the scheme separator (://) and the first path separator + + This is just a helper functions for other path utilities. + It could probably be replaced by urlparse + """ + m = _url_scheme_re.match(url) + if not m: + return None, None + + scheme = m.group('scheme') + path = m.group('path') + + # Find the path separating slash + # (first slash after the ://) + first_path_slash = path.find('/') + if first_path_slash == -1: + return len(scheme), None + return len(scheme), first_path_slash+m.start('path') + + +def is_url(url): + """Tests whether a URL is in actual fact a URL.""" + return _url_scheme_re.match(url) is not None + + +def join(base, *args): + """Create a URL by joining sections. + + This will normalize '..', assuming that paths are absolute + (it assumes no symlinks in either path) + + If any of *args is an absolute URL, it will be treated correctly. + Example: + join('http://foo', 'http://bar') => 'http://bar' + join('http://foo', 'bar') => 'http://foo/bar' + join('http://foo', 'bar', '../baz') => 'http://foo/baz' + """ + if not args: + return base + scheme_end, path_start = _find_scheme_and_separator(base) + if scheme_end is None and path_start is None: + path_start = 0 + elif path_start is None: + path_start = len(base) + path = base[path_start:] + for arg in args: + arg_scheme_end, arg_path_start = _find_scheme_and_separator(arg) + if arg_scheme_end is None and arg_path_start is None: + arg_path_start = 0 + elif arg_path_start is None: + arg_path_start = len(arg) + if arg_scheme_end is not None: + base = arg + path = arg[arg_path_start:] + scheme_end = arg_scheme_end + path_start = arg_path_start + else: + path = joinpath(path, arg) + return base[:path_start] + path + + +def joinpath(base, *args): + """Join URL path segments to a URL path segment. + + This is somewhat like osutils.joinpath, but intended for URLs. + + XXX: this duplicates some normalisation logic, and also duplicates a lot of + path handling logic that already exists in some Transport implementations. + We really should try to have exactly one place in the code base responsible + for combining paths of URLs. + """ + path = base.split('/') + if len(path) > 1 and path[-1] == '': + #If the path ends in a trailing /, remove it. + path.pop() + for arg in args: + if arg.startswith('/'): + path = [] + for chunk in arg.split('/'): + if chunk == '.': + continue + elif chunk == '..': + if path == ['']: + raise errors.InvalidURLJoin('Cannot go above root', + base, args) + path.pop() + else: + path.append(chunk) + if path == ['']: + return '/' + else: + return '/'.join(path) + + +# jam 20060502 Sorted to 'l' because the final target is 'local_path_from_url' +def _posix_local_path_from_url(url): + """Convert a url like file:///path/to/foo into /path/to/foo""" + url = split_segment_parameters_raw(url)[0] + file_localhost_prefix = 'file://localhost/' + if url.startswith(file_localhost_prefix): + path = url[len(file_localhost_prefix) - 1:] + elif not url.startswith('file:///'): + raise errors.InvalidURL( + url, 'local urls must start with file:/// or file://localhost/') + else: + path = url[len('file://'):] + # We only strip off 2 slashes + return unescape(path) + + +def _posix_local_path_to_url(path): + """Convert a local path like ./foo into a URL like file:///path/to/foo + + This also handles transforming escaping unicode characters, etc. + """ + # importing directly from posixpath allows us to test this + # on non-posix platforms + return 'file://' + escape(osutils._posix_abspath(path)) + + +def _win32_local_path_from_url(url): + """Convert a url like file:///C:/path/to/foo into C:/path/to/foo""" + if not url.startswith('file://'): + raise errors.InvalidURL(url, 'local urls must start with file:///, ' + 'UNC path urls must start with file://') + url = split_segment_parameters_raw(url)[0] + # We strip off all 3 slashes + win32_url = url[len('file:'):] + # check for UNC path: //HOST/path + if not win32_url.startswith('///'): + if (win32_url[2] == '/' + or win32_url[3] in '|:'): + raise errors.InvalidURL(url, 'Win32 UNC path urls' + ' have form file://HOST/path') + return unescape(win32_url) + + # allow empty paths so we can serve all roots + if win32_url == '///': + return '/' + + # usual local path with drive letter + if (len(win32_url) < 6 + or win32_url[3] not in ('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') + or win32_url[4] not in '|:' + or win32_url[5] != '/'): + raise errors.InvalidURL(url, 'Win32 file urls start with' + ' file:///x:/, where x is a valid drive letter') + return win32_url[3].upper() + u':' + unescape(win32_url[5:]) + + +def _win32_local_path_to_url(path): + """Convert a local path like ./foo into a URL like file:///C:/path/to/foo + + This also handles transforming escaping unicode characters, etc. + """ + # importing directly from ntpath allows us to test this + # on non-win32 platform + # FIXME: It turns out that on nt, ntpath.abspath uses nt._getfullpathname + # which actually strips trailing space characters. + # The worst part is that on linux ntpath.abspath has different + # semantics, since 'nt' is not an available module. + if path == '/': + return 'file:///' + + win32_path = osutils._win32_abspath(path) + # check for UNC path \\HOST\path + if win32_path.startswith('//'): + return 'file:' + escape(win32_path) + return ('file:///' + str(win32_path[0].upper()) + ':' + + escape(win32_path[2:])) + + +local_path_to_url = _posix_local_path_to_url +local_path_from_url = _posix_local_path_from_url +MIN_ABS_FILEURL_LENGTH = len('file:///') +WIN32_MIN_ABS_FILEURL_LENGTH = len('file:///C:/') + +if sys.platform == 'win32': + local_path_to_url = _win32_local_path_to_url + local_path_from_url = _win32_local_path_from_url + + MIN_ABS_FILEURL_LENGTH = WIN32_MIN_ABS_FILEURL_LENGTH + + +_url_scheme_re = re.compile(r'^(?P<scheme>[^:/]{2,}):(//)?(?P<path>.*)$') +_url_hex_escapes_re = re.compile(r'(%[0-9a-fA-F]{2})') + + +def _unescape_safe_chars(matchobj): + """re.sub callback to convert hex-escapes to plain characters (if safe). + + e.g. '%7E' will be converted to '~'. + """ + hex_digits = matchobj.group(0)[1:] + char = chr(int(hex_digits, 16)) + if char in _url_dont_escape_characters: + return char + else: + return matchobj.group(0).upper() + + +def normalize_url(url): + """Make sure that a path string is in fully normalized URL form. + + This handles URLs which have unicode characters, spaces, + special characters, etc. + + It has two basic modes of operation, depending on whether the + supplied string starts with a url specifier (scheme://) or not. + If it does not have a specifier it is considered a local path, + and will be converted into a file:/// url. Non-ascii characters + will be encoded using utf-8. + If it does have a url specifier, it will be treated as a "hybrid" + URL. Basically, a URL that should have URL special characters already + escaped (like +?&# etc), but may have unicode characters, etc + which would not be valid in a real URL. + + :param url: Either a hybrid URL or a local path + :return: A normalized URL which only includes 7-bit ASCII characters. + """ + scheme_end, path_start = _find_scheme_and_separator(url) + if scheme_end is None: + return local_path_to_url(url) + prefix = url[:path_start] + path = url[path_start:] + if not isinstance(url, unicode): + for c in url: + if c not in _url_safe_characters: + raise errors.InvalidURL(url, 'URLs can only contain specific' + ' safe characters (not %r)' % c) + path = _url_hex_escapes_re.sub(_unescape_safe_chars, path) + return str(prefix + ''.join(path)) + + # We have a unicode (hybrid) url + path_chars = list(path) + + for i in xrange(len(path_chars)): + if path_chars[i] not in _url_safe_characters: + chars = path_chars[i].encode('utf-8') + path_chars[i] = ''.join( + ['%%%02X' % ord(c) for c in path_chars[i].encode('utf-8')]) + path = ''.join(path_chars) + path = _url_hex_escapes_re.sub(_unescape_safe_chars, path) + return str(prefix + path) + + +def relative_url(base, other): + """Return a path to other from base. + + If other is unrelated to base, return other. Else return a relative path. + This assumes no symlinks as part of the url. + """ + dummy, base_first_slash = _find_scheme_and_separator(base) + if base_first_slash is None: + return other + + dummy, other_first_slash = _find_scheme_and_separator(other) + if other_first_slash is None: + return other + + # this takes care of differing schemes or hosts + base_scheme = base[:base_first_slash] + other_scheme = other[:other_first_slash] + if base_scheme != other_scheme: + return other + elif sys.platform == 'win32' and base_scheme == 'file://': + base_drive = base[base_first_slash+1:base_first_slash+3] + other_drive = other[other_first_slash+1:other_first_slash+3] + if base_drive != other_drive: + return other + + base_path = base[base_first_slash+1:] + other_path = other[other_first_slash+1:] + + if base_path.endswith('/'): + base_path = base_path[:-1] + + base_sections = base_path.split('/') + other_sections = other_path.split('/') + + if base_sections == ['']: + base_sections = [] + if other_sections == ['']: + other_sections = [] + + output_sections = [] + for b, o in zip(base_sections, other_sections): + if b != o: + break + output_sections.append(b) + + match_len = len(output_sections) + output_sections = ['..' for x in base_sections[match_len:]] + output_sections.extend(other_sections[match_len:]) + + return "/".join(output_sections) or "." + + +def _win32_extract_drive_letter(url_base, path): + """On win32 the drive letter needs to be added to the url base.""" + # Strip off the drive letter + # path is currently /C:/foo + if len(path) < 4 or path[2] not in ':|' or path[3] != '/': + raise errors.InvalidURL(url_base + path, + 'win32 file:/// paths need a drive letter') + url_base += path[0:3] # file:// + /C: + path = path[3:] # /foo + return url_base, path + + +def split(url, exclude_trailing_slash=True): + """Split a URL into its parent directory and a child directory. + + :param url: A relative or absolute URL + :param exclude_trailing_slash: Strip off a final '/' if it is part + of the path (but not if it is part of the protocol specification) + + :return: (parent_url, child_dir). child_dir may be the empty string if we're at + the root. + """ + scheme_loc, first_path_slash = _find_scheme_and_separator(url) + + if first_path_slash is None: + # We have either a relative path, or no separating slash + if scheme_loc is None: + # Relative path + if exclude_trailing_slash and url.endswith('/'): + url = url[:-1] + return _posix_split(url) + else: + # Scheme with no path + return url, '' + + # We have a fully defined path + url_base = url[:first_path_slash] # http://host, file:// + path = url[first_path_slash:] # /file/foo + + if sys.platform == 'win32' and url.startswith('file:///'): + # Strip off the drive letter + # url_base is currently file:// + # path is currently /C:/foo + url_base, path = _win32_extract_drive_letter(url_base, path) + # now it should be file:///C: and /foo + + if exclude_trailing_slash and len(path) > 1 and path.endswith('/'): + path = path[:-1] + head, tail = _posix_split(path) + return url_base + head, tail + + +def split_segment_parameters_raw(url): + """Split the subsegment of the last segment of a URL. + + :param url: A relative or absolute URL + :return: (url, subsegments) + """ + # GZ 2011-11-18: Dodgy removing the terminal slash like this, function + # operates on urls not url+segments, and Transport classes + # should not be blindly adding slashes in the first place. + lurl = strip_trailing_slash(url) + # Segments begin at first comma after last forward slash, if one exists + segment_start = lurl.find(",", lurl.rfind("/")+1) + if segment_start == -1: + return (url, []) + return (lurl[:segment_start], lurl[segment_start+1:].split(",")) + + +def split_segment_parameters(url): + """Split the segment parameters of the last segment of a URL. + + :param url: A relative or absolute URL + :return: (url, segment_parameters) + """ + (base_url, subsegments) = split_segment_parameters_raw(url) + parameters = {} + for subsegment in subsegments: + (key, value) = subsegment.split("=", 1) + parameters[key] = value + return (base_url, parameters) + + +def join_segment_parameters_raw(base, *subsegments): + """Create a new URL by adding subsegments to an existing one. + + This adds the specified subsegments to the last path in the specified + base URL. The subsegments should be bytestrings. + + :note: You probably want to use join_segment_parameters instead. + """ + if not subsegments: + return base + for subsegment in subsegments: + if type(subsegment) is not str: + raise TypeError("Subsegment %r is not a bytestring" % subsegment) + if "," in subsegment: + raise errors.InvalidURLJoin(", exists in subsegments", + base, subsegments) + return ",".join((base,) + subsegments) + + +def join_segment_parameters(url, parameters): + """Create a new URL by adding segment parameters to an existing one. + + The parameters of the last segment in the URL will be updated; if a + parameter with the same key already exists it will be overwritten. + + :param url: A URL, as string + :param parameters: Dictionary of parameters, keys and values as bytestrings + """ + (base, existing_parameters) = split_segment_parameters(url) + new_parameters = {} + new_parameters.update(existing_parameters) + for key, value in parameters.iteritems(): + if type(key) is not str: + raise TypeError("parameter key %r is not a bytestring" % key) + if type(value) is not str: + raise TypeError("parameter value %r for %s is not a bytestring" % + (key, value)) + if "=" in key: + raise errors.InvalidURLJoin("= exists in parameter key", url, + parameters) + new_parameters[key] = value + return join_segment_parameters_raw(base, + *["%s=%s" % item for item in sorted(new_parameters.items())]) + + +def _win32_strip_local_trailing_slash(url): + """Strip slashes after the drive letter""" + if len(url) > WIN32_MIN_ABS_FILEURL_LENGTH: + return url[:-1] + else: + return url + + +def strip_trailing_slash(url): + """Strip trailing slash, except for root paths. + + The definition of 'root path' is platform-dependent. + This assumes that all URLs are valid netloc urls, such that they + form: + scheme://host/path + It searches for ://, and then refuses to remove the next '/'. + It can also handle relative paths + Examples: + path/to/foo => path/to/foo + path/to/foo/ => path/to/foo + http://host/path/ => http://host/path + http://host/path => http://host/path + http://host/ => http://host/ + file:/// => file:/// + file:///foo/ => file:///foo + # This is unique on win32 platforms, and is the only URL + # format which does it differently. + file:///c|/ => file:///c:/ + """ + if not url.endswith('/'): + # Nothing to do + return url + if sys.platform == 'win32' and url.startswith('file://'): + return _win32_strip_local_trailing_slash(url) + + scheme_loc, first_path_slash = _find_scheme_and_separator(url) + if scheme_loc is None: + # This is a relative path, as it has no scheme + # so just chop off the last character + return url[:-1] + + if first_path_slash is None or first_path_slash == len(url)-1: + # Don't chop off anything if the only slash is the path + # separating slash + return url + + return url[:-1] + + +def unescape(url): + """Unescape relpath from url format. + + This returns a Unicode path from a URL + """ + # jam 20060427 URLs are supposed to be ASCII only strings + # If they are passed in as unicode, unquote + # will return a UNICODE string, which actually contains + # utf-8 bytes. So we have to ensure that they are + # plain ASCII strings, or the final .decode will + # try to encode the UNICODE => ASCII, and then decode + # it into utf-8. + try: + url = str(url) + except UnicodeError, e: + raise errors.InvalidURL(url, 'URL was not a plain ASCII url: %s' % (e,)) + + unquoted = unquote(url) + try: + unicode_path = unquoted.decode('utf-8') + except UnicodeError, e: + raise errors.InvalidURL(url, 'Unable to encode the URL as utf-8: %s' % (e,)) + return unicode_path + + +# These are characters that if escaped, should stay that way +_no_decode_chars = ';/?:@&=+$,#' +_no_decode_ords = [ord(c) for c in _no_decode_chars] +_no_decode_hex = (['%02x' % o for o in _no_decode_ords] + + ['%02X' % o for o in _no_decode_ords]) +_hex_display_map = dict(([('%02x' % o, chr(o)) for o in range(256)] + + [('%02X' % o, chr(o)) for o in range(256)])) +#These entries get mapped to themselves +_hex_display_map.update((hex,'%'+hex) for hex in _no_decode_hex) + +# These characters shouldn't be percent-encoded, and it's always safe to +# unencode them if they are. +_url_dont_escape_characters = set( + "abcdefghijklmnopqrstuvwxyz" # Lowercase alpha + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha + "0123456789" # Numbers + "-._~" # Unreserved characters +) + +# These characters should not be escaped +_url_safe_characters = set( + "abcdefghijklmnopqrstuvwxyz" # Lowercase alpha + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # Uppercase alpha + "0123456789" # Numbers + "_.-!~*'()" # Unreserved characters + "/;?:@&=+$," # Reserved characters + "%#" # Extra reserved characters +) + +def unescape_for_display(url, encoding): + """Decode what you can for a URL, so that we get a nice looking path. + + This will turn file:// urls into local paths, and try to decode + any portions of a http:// style url that it can. + + Any sections of the URL which can't be represented in the encoding or + need to stay as escapes are left alone. + + :param url: A 7-bit ASCII URL + :param encoding: The final output encoding + + :return: A unicode string which can be safely encoded into the + specified encoding. + """ + if encoding is None: + raise ValueError('you cannot specify None for the display encoding') + if url.startswith('file://'): + try: + path = local_path_from_url(url) + path.encode(encoding) + return path + except UnicodeError: + return url + + # Split into sections to try to decode utf-8 + res = url.split('/') + for i in xrange(1, len(res)): + escaped_chunks = res[i].split('%') + for j in xrange(1, len(escaped_chunks)): + item = escaped_chunks[j] + try: + escaped_chunks[j] = _hex_display_map[item[:2]] + item[2:] + except KeyError: + # Put back the percent symbol + escaped_chunks[j] = '%' + item + except UnicodeDecodeError: + escaped_chunks[j] = unichr(int(item[:2], 16)) + item[2:] + unescaped = ''.join(escaped_chunks) + try: + decoded = unescaped.decode('utf-8') + except UnicodeDecodeError: + # If this path segment cannot be properly utf-8 decoded + # after doing unescaping we will just leave it alone + pass + else: + try: + decoded.encode(encoding) + except UnicodeEncodeError: + # If this chunk cannot be encoded in the local + # encoding, then we should leave it alone + pass + else: + # Otherwise take the url decoded one + res[i] = decoded + return u'/'.join(res) + + +def derive_to_location(from_location): + """Derive a TO_LOCATION given a FROM_LOCATION. + + The normal case is a FROM_LOCATION of http://foo/bar => bar. + The Right Thing for some logical destinations may differ though + because no / may be present at all. In that case, the result is + the full name without the scheme indicator, e.g. lp:foo-bar => foo-bar. + This latter case also applies when a Windows drive + is used without a path, e.g. c:foo-bar => foo-bar. + If no /, path separator or : is found, the from_location is returned. + """ + if from_location.find("/") >= 0 or from_location.find(os.sep) >= 0: + return os.path.basename(from_location.rstrip("/\\")) + else: + sep = from_location.find(":") + if sep > 0: + return from_location[sep+1:] + else: + return from_location + + +def _is_absolute(url): + return (osutils.pathjoin('/foo', url) == url) + + +def rebase_url(url, old_base, new_base): + """Convert a relative path from an old base URL to a new base URL. + + The result will be a relative path. + Absolute paths and full URLs are returned unaltered. + """ + scheme, separator = _find_scheme_and_separator(url) + if scheme is not None: + return url + if _is_absolute(url): + return url + old_parsed = urlparse.urlparse(old_base) + new_parsed = urlparse.urlparse(new_base) + if (old_parsed[:2]) != (new_parsed[:2]): + raise errors.InvalidRebaseURLs(old_base, new_base) + return determine_relative_path(new_parsed[2], + join(old_parsed[2], url)) + + +def determine_relative_path(from_path, to_path): + """Determine a relative path from from_path to to_path.""" + from_segments = osutils.splitpath(from_path) + to_segments = osutils.splitpath(to_path) + count = -1 + for count, (from_element, to_element) in enumerate(zip(from_segments, + to_segments)): + if from_element != to_element: + break + else: + count += 1 + unique_from = from_segments[count:] + unique_to = to_segments[count:] + segments = (['..'] * len(unique_from) + unique_to) + if len(segments) == 0: + return '.' + return osutils.pathjoin(*segments) + + +class URL(object): + """Parsed URL.""" + + def __init__(self, scheme, quoted_user, quoted_password, quoted_host, + port, quoted_path): + self.scheme = scheme + self.quoted_host = quoted_host + self.host = unquote(self.quoted_host) + self.quoted_user = quoted_user + if self.quoted_user is not None: + self.user = unquote(self.quoted_user) + else: + self.user = None + self.quoted_password = quoted_password + if self.quoted_password is not None: + self.password = unquote(self.quoted_password) + else: + self.password = None + self.port = port + self.quoted_path = _url_hex_escapes_re.sub(_unescape_safe_chars, quoted_path) + self.path = unquote(self.quoted_path) + + def __eq__(self, other): + return (isinstance(other, self.__class__) and + self.scheme == other.scheme and + self.host == other.host and + self.user == other.user and + self.password == other.password and + self.path == other.path) + + def __repr__(self): + return "<%s(%r, %r, %r, %r, %r, %r)>" % ( + self.__class__.__name__, + self.scheme, self.quoted_user, self.quoted_password, + self.quoted_host, self.port, self.quoted_path) + + @classmethod + def from_string(cls, url): + """Create a URL object from a string. + + :param url: URL as bytestring + """ + if isinstance(url, unicode): + raise errors.InvalidURL('should be ascii:\n%r' % url) + url = url.encode('utf-8') + (scheme, netloc, path, params, + query, fragment) = urlparse.urlparse(url, allow_fragments=False) + user = password = host = port = None + if '@' in netloc: + user, host = netloc.rsplit('@', 1) + if ':' in user: + user, password = user.split(':', 1) + else: + host = netloc + + if ':' in host and not (host[0] == '[' and host[-1] == ']'): + # there *is* port + host, port = host.rsplit(':',1) + try: + port = int(port) + except ValueError: + raise errors.InvalidURL('invalid port number %s in url:\n%s' % + (port, url)) + if host != "" and host[0] == '[' and host[-1] == ']': #IPv6 + host = host[1:-1] + + return cls(scheme, user, password, host, port, path) + + def __str__(self): + netloc = self.quoted_host + if ":" in netloc: + netloc = "[%s]" % netloc + if self.quoted_user is not None: + # Note that we don't put the password back even if we + # have one so that it doesn't get accidentally + # exposed. + netloc = '%s@%s' % (self.quoted_user, netloc) + if self.port is not None: + netloc = '%s:%d' % (netloc, self.port) + return urlparse.urlunparse( + (self.scheme, netloc, self.quoted_path, None, None, None)) + + @staticmethod + def _combine_paths(base_path, relpath): + """Transform a Transport-relative path to a remote absolute path. + + This does not handle substitution of ~ but does handle '..' and '.' + components. + + Examples:: + + t._combine_paths('/home/sarah', 'project/foo') + => '/home/sarah/project/foo' + t._combine_paths('/home/sarah', '../../etc') + => '/etc' + t._combine_paths('/home/sarah', '/etc') + => '/etc' + + :param base_path: base path + :param relpath: relative url string for relative part of remote path. + :return: urlencoded string for final path. + """ + if not isinstance(relpath, str): + raise errors.InvalidURL(relpath) + relpath = _url_hex_escapes_re.sub(_unescape_safe_chars, relpath) + if relpath.startswith('/'): + base_parts = [] + else: + base_parts = base_path.split('/') + if len(base_parts) > 0 and base_parts[-1] == '': + base_parts = base_parts[:-1] + for p in relpath.split('/'): + if p == '..': + if len(base_parts) == 0: + # In most filesystems, a request for the parent + # of root, just returns root. + continue + base_parts.pop() + elif p == '.': + continue # No-op + elif p != '': + base_parts.append(p) + path = '/'.join(base_parts) + if not path.startswith('/'): + path = '/' + path + return path + + def clone(self, offset=None): + """Return a new URL for a path relative to this URL. + + :param offset: A relative path, already urlencoded + :return: `URL` instance + """ + if offset is not None: + relative = unescape(offset).encode('utf-8') + path = self._combine_paths(self.path, relative) + path = quote(path, safe="/~") + else: + path = self.quoted_path + return self.__class__(self.scheme, self.quoted_user, + self.quoted_password, self.quoted_host, self.port, + path) + + +def parse_url(url): + """Extract the server address, the credentials and the path from the url. + + user, password, host and path should be quoted if they contain reserved + chars. + + :param url: an quoted url + :return: (scheme, user, password, host, port, path) tuple, all fields + are unquoted. + """ + parsed_url = URL.from_string(url) + return (parsed_url.scheme, parsed_url.user, parsed_url.password, + parsed_url.host, parsed_url.port, parsed_url.path) |