diff options
author | Thomas Wouters <thomas@python.org> | 2006-04-21 10:40:58 +0000 |
---|---|---|
committer | Thomas Wouters <thomas@python.org> | 2006-04-21 10:40:58 +0000 |
commit | 66cac891ab67e9460620df27246918805f291990 (patch) | |
tree | 63521472f86287a79d85cfad1b44c4e97eb91c93 /Lib/urllib2.py | |
parent | 3e3958e8e08cb62877e8aac097ce9a95ae682124 (diff) | |
download | cpython-66cac891ab67e9460620df27246918805f291990.tar.gz |
Merge p3yk branch with the trunk up to revision 45595. This breaks a fair
number of tests, all because of the codecs/_multibytecodecs issue described
here (it's not a Py3K issue, just something Py3K discovers):
http://mail.python.org/pipermail/python-dev/2006-April/064051.html
Hye-Shik Chang promised to look for a fix, so no need to fix it here. The
tests that are expected to break are:
test_codecencodings_cn
test_codecencodings_hk
test_codecencodings_jp
test_codecencodings_kr
test_codecencodings_tw
test_codecs
test_multibytecodec
This merge fixes an actual test failure (test_weakref) in this branch,
though, so I believe merging is the right thing to do anyway.
Diffstat (limited to 'Lib/urllib2.py')
-rw-r--r-- | Lib/urllib2.py | 205 |
1 files changed, 106 insertions, 99 deletions
diff --git a/Lib/urllib2.py b/Lib/urllib2.py index 4c83bfc780..ec01c8fd93 100644 --- a/Lib/urllib2.py +++ b/Lib/urllib2.py @@ -14,7 +14,7 @@ non-error returns. The HTTPRedirectHandler automatically deals with HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler deals with digest authentication. -urlopen(url, data=None) -- basic usage is that same as original +urlopen(url, data=None) -- basic usage is the same as original urllib. pass the url and optionally data to post to an HTTP URL, and get a file-like object back. One difference is that you can also pass a Request instance instead of URL. Raises a URLError (subclass of @@ -77,16 +77,13 @@ f = urllib2.urlopen('http://www.python.org/') # the handler knows that the problem was, e.g., that it didn't know # that hash algo that requested in the challenge, it would be good to # pass that information along to the client, too. - -# XXX to do: -# name! -# documentation (getting there) -# complex proxies -# abstract factory for opener # ftp errors aren't handled cleanly -# gopher can return a socket.error # check digest against correct (i.e. non-apache) implementation +# Possible extensions: +# complex proxies XXX not sure what exactly was meant by this +# abstract factory for opener + import base64 import ftplib import httplib @@ -111,15 +108,15 @@ try: except ImportError: from StringIO import StringIO -# not sure how many of these need to be gotten rid of -from urllib import (unwrap, unquote, splittype, splithost, +from urllib import (unwrap, unquote, splittype, splithost, quote, addinfourl, splitport, splitgophertype, splitquery, splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue) # support for FileHandler, proxies via environment variables from urllib import localhost, url2pathname, getproxies -__version__ = "2.4" +# used in User-Agent header sent +__version__ = sys.version[:3] _opener = None def urlopen(url, data=None): @@ -330,8 +327,9 @@ class OpenerDirector: pass def _call_chain(self, chain, kind, meth_name, *args): - # XXX raise an exception if no one else should try to handle - # this url. return None if you can't but someone else could. + # Handlers raise an exception if no one else should try to handle + # the request, or return None if they can't but another handler + # could. Otherwise, they return the response. handlers = chain.get(kind, ()) for handler in handlers: func = getattr(handler, meth_name) @@ -507,6 +505,8 @@ class HTTPRedirectHandler(BaseHandler): # from the user (of urllib2, in this case). In practice, # essentially all clients do redirect in this case, so we # do the same. + # be conciliant with URIs containing a space + newurl = newurl.replace(' ', '%20') return Request(newurl, headers=req.headers, origin_req_host=req.get_origin_req_host(), @@ -561,6 +561,80 @@ class HTTPRedirectHandler(BaseHandler): "lead to an infinite loop.\n" \ "The last 30x error message was:\n" + +def _parse_proxy(proxy): + """Return (scheme, user, password, host/port) given a URL or an authority. + + If a URL is supplied, it must have an authority (host:port) component. + According to RFC 3986, having an authority component means the URL must + have two slashes after the scheme: + + >>> _parse_proxy('file:/ftp.example.com/') + Traceback (most recent call last): + ValueError: proxy URL with no authority: 'file:/ftp.example.com/' + + The first three items of the returned tuple may be None. + + Examples of authority parsing: + + >>> _parse_proxy('proxy.example.com') + (None, None, None, 'proxy.example.com') + >>> _parse_proxy('proxy.example.com:3128') + (None, None, None, 'proxy.example.com:3128') + + The authority component may optionally include userinfo (assumed to be + username:password): + + >>> _parse_proxy('joe:password@proxy.example.com') + (None, 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('joe:password@proxy.example.com:3128') + (None, 'joe', 'password', 'proxy.example.com:3128') + + Same examples, but with URLs instead: + + >>> _parse_proxy('http://proxy.example.com/') + ('http', None, None, 'proxy.example.com') + >>> _parse_proxy('http://proxy.example.com:3128/') + ('http', None, None, 'proxy.example.com:3128') + >>> _parse_proxy('http://joe:password@proxy.example.com/') + ('http', 'joe', 'password', 'proxy.example.com') + >>> _parse_proxy('http://joe:password@proxy.example.com:3128') + ('http', 'joe', 'password', 'proxy.example.com:3128') + + Everything after the authority is ignored: + + >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') + ('ftp', 'joe', 'password', 'proxy.example.com') + + Test for no trailing '/' case: + + >>> _parse_proxy('http://joe:password@proxy.example.com') + ('http', 'joe', 'password', 'proxy.example.com') + + """ + from urlparse import _splitnetloc + scheme, r_scheme = splittype(proxy) + if not r_scheme.startswith("/"): + # authority + scheme = None + authority = proxy + else: + # URL + if not r_scheme.startswith("//"): + raise ValueError("proxy URL with no authority: %r" % proxy) + # We have an authority, so for RFC 3986-compliant URLs (by ss 3. + # and 3.3.), path is empty or starts with '/' + end = r_scheme.find("/", 2) + if end == -1: + end = None + authority = r_scheme[2:end] + userinfo, hostport = splituser(authority) + if userinfo is not None: + user, password = splitpasswd(userinfo) + else: + user = password = None + return scheme, user, password, hostport + class ProxyHandler(BaseHandler): # Proxies must be in front handler_order = 100 @@ -577,76 +651,27 @@ class ProxyHandler(BaseHandler): def proxy_open(self, req, proxy, type): orig_type = req.get_type() - type, r_type = splittype(proxy) - if not type or r_type.isdigit(): - # proxy is specified without protocol - type = orig_type - host = proxy - else: - host, r_host = splithost(r_type) - user_pass, host = splituser(host) - user, password = splitpasswd(user_pass) + proxy_type, user, password, hostport = _parse_proxy(proxy) + if proxy_type is None: + proxy_type = orig_type if user and password: - user, password = user_pass.split(':', 1) - user_pass = base64.encodestring('%s:%s' % (unquote(user), - unquote(password))).strip() - req.add_header('Proxy-authorization', 'Basic ' + user_pass) - host = unquote(host) - req.set_proxy(host, type) - if orig_type == type: + user_pass = '%s:%s' % (unquote(user), unquote(password)) + creds = base64.encodestring(user_pass).strip() + req.add_header('Proxy-authorization', 'Basic ' + creds) + hostport = unquote(hostport) + req.set_proxy(hostport, proxy_type) + if orig_type == proxy_type: # let other handlers take care of it - # XXX this only makes sense if the proxy is before the - # other handlers return None else: # need to start over, because the other handlers don't # grok the proxy's URL type + # e.g. if we have a constructor arg proxies like so: + # {'http': 'ftp://proxy.example.com'}, we may end up turning + # a request for http://acme.example.com/a into one for + # ftp://proxy.example.com/a return self.parent.open(req) -# feature suggested by Duncan Booth -# XXX custom is not a good name -class CustomProxy: - # either pass a function to the constructor or override handle - def __init__(self, proto, func=None, proxy_addr=None): - self.proto = proto - self.func = func - self.addr = proxy_addr - - def handle(self, req): - if self.func and self.func(req): - return 1 - - def get_proxy(self): - return self.addr - -class CustomProxyHandler(BaseHandler): - # Proxies must be in front - handler_order = 100 - - def __init__(self, *proxies): - self.proxies = {} - - def proxy_open(self, req): - proto = req.get_type() - try: - proxies = self.proxies[proto] - except KeyError: - return None - for p in proxies: - if p.handle(req): - req.set_proxy(p.get_proxy()) - return self.parent.open(req) - return None - - def do_proxy(self, p, req): - return self.parent.open(req) - - def add_proxy(self, cpo): - if cpo.proto in self.proxies: - self.proxies[cpo.proto].append(cpo) - else: - self.proxies[cpo.proto] = [cpo] - class HTTPPasswordMgr: def __init__(self): self.passwd = {} @@ -1128,8 +1153,11 @@ class FileHandler(BaseHandler): names = None def get_names(self): if FileHandler.names is None: - FileHandler.names = (socket.gethostbyname('localhost'), - socket.gethostbyname(socket.gethostname())) + try: + FileHandler.names = (socket.gethostbyname('localhost'), + socket.gethostbyname(socket.gethostname())) + except socket.gaierror: + FileHandler.names = (socket.gethostbyname('localhost'),) return FileHandler.names # not entirely sure what the rules are here @@ -1258,6 +1286,7 @@ class CacheFTPHandler(FTPHandler): class GopherHandler(BaseHandler): def gopher_open(self, req): + # XXX can raise socket.error import gopherlib # this raises DeprecationWarning in 2.5 host = req.get_host() if not host: @@ -1273,25 +1302,3 @@ class GopherHandler(BaseHandler): else: fp = gopherlib.send_selector(selector, host) return addinfourl(fp, noheaders(), req.get_full_url()) - -#bleck! don't use this yet -class OpenerFactory: - - default_handlers = [UnknownHandler, HTTPHandler, - HTTPDefaultErrorHandler, HTTPRedirectHandler, - FTPHandler, FileHandler] - handlers = [] - replacement_handlers = [] - - def add_handler(self, h): - self.handlers = self.handlers + [h] - - def replace_handler(self, h): - pass - - def build_opener(self): - opener = OpenerDirector() - for ph in self.default_handlers: - if inspect.isclass(ph): - ph = ph() - opener.add_handler(ph) |