1 files changed, 106 insertions, 99 deletions
diff --git a/Lib/urllib2.py b/Lib/urllib2.py
index 4c83bfc780..ec01c8fd93 100644
--- a/Lib/urllib2.py
+++ b/Lib/urllib2.py
@@ -14,7 +14,7 @@ non-error returns.  The HTTPRedirectHandler automatically deals with
 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
 deals with digest authentication.
 
-urlopen(url, data=None) -- basic usage is that same as original
+urlopen(url, data=None) -- basic usage is the same as original
 urllib.  pass the url and optionally data to post to an HTTP URL, and
 get a file-like object back.  One difference is that you can also pass
 a Request instance instead of URL.  Raises a URLError (subclass of
@@ -77,16 +77,13 @@ f = urllib2.urlopen('http://www.python.org/')
 # the handler knows that the problem was, e.g., that it didn't know
 # that hash algo that requested in the challenge, it would be good to
 # pass that information along to the client, too.
-
-# XXX to do:
-# name!
-# documentation (getting there)
-# complex proxies
-# abstract factory for opener
 # ftp errors aren't handled cleanly
-# gopher can return a socket.error
 # check digest against correct (i.e. non-apache) implementation
 
+# Possible extensions:
+# complex proxies  XXX not sure what exactly was meant by this
+# abstract factory for opener
+
 import base64
 import ftplib
 import httplib
@@ -111,15 +108,15 @@ try:
 except ImportError:
     from StringIO import StringIO
 
-# not sure how many of these need to be gotten rid of
-from urllib import (unwrap, unquote, splittype, splithost,
+from urllib import (unwrap, unquote, splittype, splithost, quote,
      addinfourl, splitport, splitgophertype, splitquery,
      splitattr, ftpwrapper, noheaders, splituser, splitpasswd, splitvalue)
 
 # support for FileHandler, proxies via environment variables
 from urllib import localhost, url2pathname, getproxies
 
-__version__ = "2.4"
+# used in User-Agent header sent
+__version__ = sys.version[:3]
 
 _opener = None
 def urlopen(url, data=None):
@@ -330,8 +327,9 @@ class OpenerDirector:
         pass
 
     def _call_chain(self, chain, kind, meth_name, *args):
-        # XXX raise an exception if no one else should try to handle
-        # this url.  return None if you can't but someone else could.
+        # Handlers raise an exception if no one else should try to handle
+        # the request, or return None if they can't but another handler
+        # could.  Otherwise, they return the response.
         handlers = chain.get(kind, ())
         for handler in handlers:
             func = getattr(handler, meth_name)
@@ -507,6 +505,8 @@ class HTTPRedirectHandler(BaseHandler):
             # from the user (of urllib2, in this case).  In practice,
             # essentially all clients do redirect in this case, so we
             # do the same.
+            # be conciliant with URIs containing a space
+            newurl = newurl.replace(' ', '%20')
             return Request(newurl,
                            headers=req.headers,
                            origin_req_host=req.get_origin_req_host(),
@@ -561,6 +561,80 @@ class HTTPRedirectHandler(BaseHandler):
               "lead to an infinite loop.\n" \
               "The last 30x error message was:\n"
 
+
+def _parse_proxy(proxy):
+    """Return (scheme, user, password, host/port) given a URL or an authority.
+
+    If a URL is supplied, it must have an authority (host:port) component.
+    According to RFC 3986, having an authority component means the URL must
+    have two slashes after the scheme:
+
+    >>> _parse_proxy('file:/ftp.example.com/')
+    Traceback (most recent call last):
+    ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
+
+    The first three items of the returned tuple may be None.
+
+    Examples of authority parsing:
+
+    >>> _parse_proxy('proxy.example.com')
+    (None, None, None, 'proxy.example.com')
+    >>> _parse_proxy('proxy.example.com:3128')
+    (None, None, None, 'proxy.example.com:3128')
+
+    The authority component may optionally include userinfo (assumed to be
+    username:password):
+
+    >>> _parse_proxy('joe:password@proxy.example.com')
+    (None, 'joe', 'password', 'proxy.example.com')
+    >>> _parse_proxy('joe:password@proxy.example.com:3128')
+    (None, 'joe', 'password', 'proxy.example.com:3128')
+
+    Same examples, but with URLs instead:
+
+    >>> _parse_proxy('http://proxy.example.com/')
+    ('http', None, None, 'proxy.example.com')
+    >>> _parse_proxy('http://proxy.example.com:3128/')
+    ('http', None, None, 'proxy.example.com:3128')
+    >>> _parse_proxy('http://joe:password@proxy.example.com/')
+    ('http', 'joe', 'password', 'proxy.example.com')
+    >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
+    ('http', 'joe', 'password', 'proxy.example.com:3128')
+
+    Everything after the authority is ignored:
+
+    >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
+    ('ftp', 'joe', 'password', 'proxy.example.com')
+
+    Test for no trailing '/' case:
+
+    >>> _parse_proxy('http://joe:password@proxy.example.com')
+    ('http', 'joe', 'password', 'proxy.example.com')
+
+    """
+    from urlparse import _splitnetloc
+    scheme, r_scheme = splittype(proxy)
+    if not r_scheme.startswith("/"):
+        # authority
+        scheme = None
+        authority = proxy
+    else:
+        # URL
+        if not r_scheme.startswith("//"):
+            raise ValueError("proxy URL with no authority: %r" % proxy)
+        # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
+        # and 3.3.), path is empty or starts with '/'
+        end = r_scheme.find("/", 2)
+        if end == -1:
+            end = None
+        authority = r_scheme[2:end]
+    userinfo, hostport = splituser(authority)
+    if userinfo is not None:
+        user, password = splitpasswd(userinfo)
+    else:
+        user = password = None
+    return scheme, user, password, hostport
+
 class ProxyHandler(BaseHandler):
     # Proxies must be in front
     handler_order = 100
@@ -577,76 +651,27 @@ class ProxyHandler(BaseHandler):
 
     def proxy_open(self, req, proxy, type):
         orig_type = req.get_type()
-        type, r_type = splittype(proxy)
-        if not type or r_type.isdigit():
-            # proxy is specified without protocol
-            type = orig_type
-            host = proxy
-        else:
-            host, r_host = splithost(r_type)
-        user_pass, host = splituser(host)
-        user, password = splitpasswd(user_pass)
+        proxy_type, user, password, hostport = _parse_proxy(proxy)
+        if proxy_type is None:
+            proxy_type = orig_type
         if user and password:
-            user, password = user_pass.split(':', 1)
-            user_pass = base64.encodestring('%s:%s' % (unquote(user),
-                                            unquote(password))).strip()
-            req.add_header('Proxy-authorization', 'Basic ' + user_pass)
-        host = unquote(host)
-        req.set_proxy(host, type)
-        if orig_type == type:
+            user_pass = '%s:%s' % (unquote(user), unquote(password))
+            creds = base64.encodestring(user_pass).strip()
+            req.add_header('Proxy-authorization', 'Basic ' + creds)
+        hostport = unquote(hostport)
+        req.set_proxy(hostport, proxy_type)
+        if orig_type == proxy_type:
             # let other handlers take care of it
-            # XXX this only makes sense if the proxy is before the
-            # other handlers
             return None
         else:
             # need to start over, because the other handlers don't
             # grok the proxy's URL type
+            # e.g. if we have a constructor arg proxies like so:
+            # {'http': 'ftp://proxy.example.com'}, we may end up turning
+            # a request for http://acme.example.com/a into one for
+            # ftp://proxy.example.com/a
             return self.parent.open(req)
 
-# feature suggested by Duncan Booth
-# XXX custom is not a good name
-class CustomProxy:
-    # either pass a function to the constructor or override handle
-    def __init__(self, proto, func=None, proxy_addr=None):
-        self.proto = proto
-        self.func = func
-        self.addr = proxy_addr
-
-    def handle(self, req):
-        if self.func and self.func(req):
-            return 1
-
-    def get_proxy(self):
-        return self.addr
-
-class CustomProxyHandler(BaseHandler):
-    # Proxies must be in front
-    handler_order = 100
-
-    def __init__(self, *proxies):
-        self.proxies = {}
-
-    def proxy_open(self, req):
-        proto = req.get_type()
-        try:
-            proxies = self.proxies[proto]
-        except KeyError:
-            return None
-        for p in proxies:
-            if p.handle(req):
-                req.set_proxy(p.get_proxy())
-                return self.parent.open(req)
-        return None
-
-    def do_proxy(self, p, req):
-        return self.parent.open(req)
-
-    def add_proxy(self, cpo):
-        if cpo.proto in self.proxies:
-            self.proxies[cpo.proto].append(cpo)
-        else:
-            self.proxies[cpo.proto] = [cpo]
-
 class HTTPPasswordMgr:
     def __init__(self):
         self.passwd = {}
@@ -1128,8 +1153,11 @@ class FileHandler(BaseHandler):
     names = None
     def get_names(self):
         if FileHandler.names is None:
-            FileHandler.names = (socket.gethostbyname('localhost'),
-                                 socket.gethostbyname(socket.gethostname()))
+            try:
+                FileHandler.names = (socket.gethostbyname('localhost'),
+                                    socket.gethostbyname(socket.gethostname()))
+            except socket.gaierror:
+                FileHandler.names = (socket.gethostbyname('localhost'),)
         return FileHandler.names
 
     # not entirely sure what the rules are here
@@ -1258,6 +1286,7 @@ class CacheFTPHandler(FTPHandler):
 
 class GopherHandler(BaseHandler):
     def gopher_open(self, req):
+        # XXX can raise socket.error
         import gopherlib  # this raises DeprecationWarning in 2.5
         host = req.get_host()
         if not host:
@@ -1273,25 +1302,3 @@ class GopherHandler(BaseHandler):
         else:
             fp = gopherlib.send_selector(selector, host)
         return addinfourl(fp, noheaders(), req.get_full_url())
-
-#bleck! don't use this yet
-class OpenerFactory:
-
-    default_handlers = [UnknownHandler, HTTPHandler,
-                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
-                        FTPHandler, FileHandler]
-    handlers = []
-    replacement_handlers = []
-
-    def add_handler(self, h):
-        self.handlers = self.handlers + [h]
-
-    def replace_handler(self, h):
-        pass
-
-    def build_opener(self):
-        opener = OpenerDirector()
-        for ph in self.default_handlers:
-            if inspect.isclass(ph):
-                ph = ph()
-            opener.add_handler(ph)