summaryrefslogtreecommitdiff
path: root/Lib/urllib2.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/urllib2.py')
-rw-r--r--Lib/urllib2.py205
1 files changed, 106 insertions, 99 deletions
diff --git a/Lib/urllib2.py b/Lib/urllib2.py
index 4c83bfc780..ec01c8fd93 100644
--- a/Lib/urllib2.py
+++ b/Lib/urllib2.py
@@ -14,7 +14,7 @@ non-error returns. The HTTPRedirectHandler automatically deals with
HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
deals with digest authentication.
-urlopen(url, data=None) -- basic usage is that same as original
+urlopen(url, data=None) -- basic usage is the same as original
urllib. pass the url and optionally data to post to an HTTP URL, and
get a file-like object back. One difference is that you can also pass
a Request instance instead of URL. Raises a URLError (subclass of
@@ -77,16 +77,13 @@ f = urllib2.urlopen('http://www.python.org/')
# the handler knows that the problem was, e.g., that it didn't know
# that hash algo that requested in the challenge, it would be good to
# pass that information along to the client, too.
-
-# XXX to do:
-# name!
-# documentation (getting there)
-# complex proxies
-# abstract factory for opener
# ftp errors aren't handled cleanly
-# gopher can return a socket.error
# check digest against correct (i.e. non-apache) implementation
+# Possible extensions:
+# complex proxies XXX not sure what exactly was meant by this
+# abstract factory for opener
+
import base64
import ftplib
import httplib
@@ -111,15 +108,15 @@ try:
except ImportError:
from StringIO import StringIO
-# not sure how many of these need to be gotten rid of
-from urllib import (unwrap, unquote, splittype, splithost,
+from urllib import (unwrap, unquote, splittype, splithost, quote,
addinfourl, splitport, splitgophertype, splitquery,
splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
# support for FileHandler, proxies via environment variables
from urllib import localhost, url2pathname, getproxies
-__version__ = "2.4"
+# used in User-Agent header sent
+__version__ = sys.version[:3]
_opener = None
def urlopen(url, data=None):
@@ -330,8 +327,9 @@ class OpenerDirector:
pass
def _call_chain(self, chain, kind, meth_name, *args):
- # XXX raise an exception if no one else should try to handle
- # this url. return None if you can't but someone else could.
+ # Handlers raise an exception if no one else should try to handle
+ # the request, or return None if they can't but another handler
+ # could. Otherwise, they return the response.
handlers = chain.get(kind, ())
for handler in handlers:
func = getattr(handler, meth_name)
@@ -507,6 +505,8 @@ class HTTPRedirectHandler(BaseHandler):
# from the user (of urllib2, in this case). In practice,
# essentially all clients do redirect in this case, so we
# do the same.
+ # be conciliant with URIs containing a space
+ newurl = newurl.replace(' ', '%20')
return Request(newurl,
headers=req.headers,
origin_req_host=req.get_origin_req_host(),
@@ -561,6 +561,80 @@ class HTTPRedirectHandler(BaseHandler):
"lead to an infinite loop.\n" \
"The last 30x error message was:\n"
+
+def _parse_proxy(proxy):
+ """Return (scheme, user, password, host/port) given a URL or an authority.
+
+ If a URL is supplied, it must have an authority (host:port) component.
+ According to RFC 3986, having an authority component means the URL must
+ have two slashes after the scheme:
+
+ >>> _parse_proxy('file:/ftp.example.com/')
+ Traceback (most recent call last):
+ ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
+
+ The first three items of the returned tuple may be None.
+
+ Examples of authority parsing:
+
+ >>> _parse_proxy('proxy.example.com')
+ (None, None, None, 'proxy.example.com')
+ >>> _parse_proxy('proxy.example.com:3128')
+ (None, None, None, 'proxy.example.com:3128')
+
+ The authority component may optionally include userinfo (assumed to be
+ username:password):
+
+ >>> _parse_proxy('joe:password@proxy.example.com')
+ (None, 'joe', 'password', 'proxy.example.com')
+ >>> _parse_proxy('joe:password@proxy.example.com:3128')
+ (None, 'joe', 'password', 'proxy.example.com:3128')
+
+ Same examples, but with URLs instead:
+
+ >>> _parse_proxy('http://proxy.example.com/')
+ ('http', None, None, 'proxy.example.com')
+ >>> _parse_proxy('http://proxy.example.com:3128/')
+ ('http', None, None, 'proxy.example.com:3128')
+ >>> _parse_proxy('http://joe:password@proxy.example.com/')
+ ('http', 'joe', 'password', 'proxy.example.com')
+ >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
+ ('http', 'joe', 'password', 'proxy.example.com:3128')
+
+ Everything after the authority is ignored:
+
+ >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
+ ('ftp', 'joe', 'password', 'proxy.example.com')
+
+ Test for no trailing '/' case:
+
+ >>> _parse_proxy('http://joe:password@proxy.example.com')
+ ('http', 'joe', 'password', 'proxy.example.com')
+
+ """
+ from urlparse import _splitnetloc
+ scheme, r_scheme = splittype(proxy)
+ if not r_scheme.startswith("/"):
+ # authority
+ scheme = None
+ authority = proxy
+ else:
+ # URL
+ if not r_scheme.startswith("//"):
+ raise ValueError("proxy URL with no authority: %r" % proxy)
+ # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
+ # and 3.3.), path is empty or starts with '/'
+ end = r_scheme.find("/", 2)
+ if end == -1:
+ end = None
+ authority = r_scheme[2:end]
+ userinfo, hostport = splituser(authority)
+ if userinfo is not None:
+ user, password = splitpasswd(userinfo)
+ else:
+ user = password = None
+ return scheme, user, password, hostport
+
class ProxyHandler(BaseHandler):
# Proxies must be in front
handler_order = 100
@@ -577,76 +651,27 @@ class ProxyHandler(BaseHandler):
def proxy_open(self, req, proxy, type):
orig_type = req.get_type()
- type, r_type = splittype(proxy)
- if not type or r_type.isdigit():
- # proxy is specified without protocol
- type = orig_type
- host = proxy
- else:
- host, r_host = splithost(r_type)
- user_pass, host = splituser(host)
- user, password = splitpasswd(user_pass)
+ proxy_type, user, password, hostport = _parse_proxy(proxy)
+ if proxy_type is None:
+ proxy_type = orig_type
if user and password:
- user, password = user_pass.split(':', 1)
- user_pass = base64.encodestring('%s:%s' % (unquote(user),
- unquote(password))).strip()
- req.add_header('Proxy-authorization', 'Basic ' + user_pass)
- host = unquote(host)
- req.set_proxy(host, type)
- if orig_type == type:
+ user_pass = '%s:%s' % (unquote(user), unquote(password))
+ creds = base64.encodestring(user_pass).strip()
+ req.add_header('Proxy-authorization', 'Basic ' + creds)
+ hostport = unquote(hostport)
+ req.set_proxy(hostport, proxy_type)
+ if orig_type == proxy_type:
# let other handlers take care of it
- # XXX this only makes sense if the proxy is before the
- # other handlers
return None
else:
# need to start over, because the other handlers don't
# grok the proxy's URL type
+ # e.g. if we have a constructor arg proxies like so:
+ # {'http': 'ftp://proxy.example.com'}, we may end up turning
+ # a request for http://acme.example.com/a into one for
+ # ftp://proxy.example.com/a
return self.parent.open(req)
-# feature suggested by Duncan Booth
-# XXX custom is not a good name
-class CustomProxy:
- # either pass a function to the constructor or override handle
- def __init__(self, proto, func=None, proxy_addr=None):
- self.proto = proto
- self.func = func
- self.addr = proxy_addr
-
- def handle(self, req):
- if self.func and self.func(req):
- return 1
-
- def get_proxy(self):
- return self.addr
-
-class CustomProxyHandler(BaseHandler):
- # Proxies must be in front
- handler_order = 100
-
- def __init__(self, *proxies):
- self.proxies = {}
-
- def proxy_open(self, req):
- proto = req.get_type()
- try:
- proxies = self.proxies[proto]
- except KeyError:
- return None
- for p in proxies:
- if p.handle(req):
- req.set_proxy(p.get_proxy())
- return self.parent.open(req)
- return None
-
- def do_proxy(self, p, req):
- return self.parent.open(req)
-
- def add_proxy(self, cpo):
- if cpo.proto in self.proxies:
- self.proxies[cpo.proto].append(cpo)
- else:
- self.proxies[cpo.proto] = [cpo]
-
class HTTPPasswordMgr:
def __init__(self):
self.passwd = {}
@@ -1128,8 +1153,11 @@ class FileHandler(BaseHandler):
names = None
def get_names(self):
if FileHandler.names is None:
- FileHandler.names = (socket.gethostbyname('localhost'),
- socket.gethostbyname(socket.gethostname()))
+ try:
+ FileHandler.names = (socket.gethostbyname('localhost'),
+ socket.gethostbyname(socket.gethostname()))
+ except socket.gaierror:
+ FileHandler.names = (socket.gethostbyname('localhost'),)
return FileHandler.names
# not entirely sure what the rules are here
@@ -1258,6 +1286,7 @@ class CacheFTPHandler(FTPHandler):
class GopherHandler(BaseHandler):
def gopher_open(self, req):
+ # XXX can raise socket.error
import gopherlib # this raises DeprecationWarning in 2.5
host = req.get_host()
if not host:
@@ -1273,25 +1302,3 @@ class GopherHandler(BaseHandler):
else:
fp = gopherlib.send_selector(selector, host)
return addinfourl(fp, noheaders(), req.get_full_url())
-
-#bleck! don't use this yet
-class OpenerFactory:
-
- default_handlers = [UnknownHandler, HTTPHandler,
- HTTPDefaultErrorHandler, HTTPRedirectHandler,
- FTPHandler, FileHandler]
- handlers = []
- replacement_handlers = []
-
- def add_handler(self, h):
- self.handlers = self.handlers + [h]
-
- def replace_handler(self, h):
- pass
-
- def build_opener(self):
- opener = OpenerDirector()
- for ph in self.default_handlers:
- if inspect.isclass(ph):
- ph = ph()
- opener.add_handler(ph)