summary refs log tree commit diff
path: root/Lib/urllib
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/urllib')
-rw-r--r-- Lib/urllib/parse.py 70
-rw-r--r-- Lib/urllib/request.py 90
-rw-r--r-- Lib/urllib/robotparser.py 43
3 files changed, 184 insertions, 19 deletions
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
index 566fbf7188..958767a08d 100644
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -157,9 +157,8 @@ class _NetlocResultMixinBase(object):
port = self._hostinfo[1]
if port is not None:
port = int(port, 10)
- # Return None on an illegal port
if not ( 0 <= port <= 65535):
- return None
+ raise ValueError("Port out of range 0-65535")
return port
@@ -226,8 +225,71 @@ class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
from collections import namedtuple
_DefragResultBase = namedtuple('DefragResult', 'url fragment')
-_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
-_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')
+_SplitResultBase = namedtuple(
+ 'SplitResult', 'scheme netloc path query fragment')
+_ParseResultBase = namedtuple(
+ 'ParseResult', 'scheme netloc path params query fragment')
+
+_DefragResultBase.__doc__ = """
+DefragResult(url, fragment)
+
+A 2-tuple that contains the url without fragment identifier and the fragment
+identifier as a separate argument.
+"""
+
+_DefragResultBase.url.__doc__ = """The URL with no fragment identifier."""
+
+_DefragResultBase.fragment.__doc__ = """
+Fragment identifier separated from URL, that allows indirect identification of a
+secondary resource by reference to a primary resource and additional identifying
+information.
+"""
+
+_SplitResultBase.__doc__ = """
+SplitResult(scheme, netloc, path, query, fragment)
+
+A 5-tuple that contains the different components of a URL. Similar to
+ParseResult, but does not split params.
+"""
+
+_SplitResultBase.scheme.__doc__ = """Specifies URL scheme for the request."""
+
+_SplitResultBase.netloc.__doc__ = """
+Network location where the request is made to.
+"""
+
+_SplitResultBase.path.__doc__ = """
+The hierarchical path, such as the path to a file to download.
+"""
+
+_SplitResultBase.query.__doc__ = """
+The query component, which contains non-hierarchical data that, along with data
+in the path component, identifies a resource within the scope of the URI's
+scheme and network location.
+"""
+
+_SplitResultBase.fragment.__doc__ = """
+Fragment identifier, that allows indirect identification of a secondary resource
+by reference to a primary resource and additional identifying information.
+"""
+
+_ParseResultBase.__doc__ = """
+ParseResult(scheme, netloc, path, params, query, fragment)
+
+A 6-tuple that contains components of a parsed URL.
+"""
+
+_ParseResultBase.scheme.__doc__ = _SplitResultBase.scheme.__doc__
+_ParseResultBase.netloc.__doc__ = _SplitResultBase.netloc.__doc__
+_ParseResultBase.path.__doc__ = _SplitResultBase.path.__doc__
+_ParseResultBase.params.__doc__ = """
+Parameters for last path element used to dereference the URI in order to provide
+access to perform some operation on the resource.
+"""
+
+_ParseResultBase.query.__doc__ = _SplitResultBase.query.__doc__
+_ParseResultBase.fragment.__doc__ = _SplitResultBase.fragment.__doc__
+
# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
index a46c689493..b6690c3d0e 100644
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py
@@ -134,13 +134,73 @@ __all__ = [
]
# used in User-Agent header sent
-__version__ = sys.version[:3]
+__version__ = '%d.%d' % sys.version_info[:2]
_opener = None
def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
*, cafile=None, capath=None, cadefault=False, context=None):
+ '''Open the URL url, which can be either a string or a Request object.
+
+ *data* must be an object specifying additional data to be sent to
+ the server, or None if no such data is needed. See Request for
+ details.
+
+ urllib.request module uses HTTP/1.1 and includes a "Connection:close"
+ header in its HTTP requests.
+
+ The optional *timeout* parameter specifies a timeout in seconds for
+ blocking operations like the connection attempt (if not specified, the
+ global default timeout setting will be used). This only works for HTTP,
+ HTTPS and FTP connections.
+
+ If *context* is specified, it must be a ssl.SSLContext instance describing
+ the various SSL options. See HTTPSConnection for more details.
+
+ The optional *cafile* and *capath* parameters specify a set of trusted CA
+ certificates for HTTPS requests. cafile should point to a single file
+ containing a bundle of CA certificates, whereas capath should point to a
+ directory of hashed certificate files. More information can be found in
+ ssl.SSLContext.load_verify_locations().
+
+ The *cadefault* parameter is ignored.
+
+ This function always returns an object which can work as a context
+ manager and has methods such as
+
+ * geturl() - return the URL of the resource retrieved, commonly used to
+ determine if a redirect was followed
+
+ * info() - return the meta-information of the page, such as headers, in the
+ form of an email.message_from_string() instance (see Quick Reference to
+ HTTP Headers)
+
+ * getcode() - return the HTTP status code of the response. Raises URLError
+ on errors.
+
+ For HTTP and HTTPS URLs, this function returns a http.client.HTTPResponse
+ object slightly modified. In addition to the three new methods above, the
+ msg attribute contains the same information as the reason attribute ---
+ the reason phrase returned by the server --- instead of the response
+ headers as it is specified in the documentation for HTTPResponse.
+
+ For FTP, file, and data URLs and requests explicitly handled by legacy
+ URLopener and FancyURLopener classes, this function returns a
+ urllib.response.addinfourl object.
+
+ Note that None may be returned if no handler handles the request (though
+ the default installed global OpenerDirector uses UnknownHandler to ensure
+ this never happens).
+
+ In addition, if proxy settings are detected (for example, when a *_proxy
+ environment variable like http_proxy is set), ProxyHandler is installed by
+ default and makes sure the requests are handled through the proxy.
+
+ '''
global _opener
if cafile or capath or cadefault:
+ import warnings
+ warnings.warn("cafile, capath and cadefault are deprecated, use a "
+ "custom context instead.", DeprecationWarning, 2)
if context is not None:
raise ValueError(
"You can't pass both context and any of cafile, capath, and "
@@ -1170,6 +1230,11 @@ class AbstractHTTPHandler(BaseHandler):
def set_http_debuglevel(self, level):
self._debuglevel = level
+ def _get_content_length(self, request):
+ return http.client.HTTPConnection._get_content_length(
+ request.data,
+ request.get_method())
+
def do_request_(self, request):
host = request.host
if not host:
@@ -1178,24 +1243,22 @@ class AbstractHTTPHandler(BaseHandler):
if request.data is not None: # POST
data = request.data
if isinstance(data, str):
- msg = "POST data should be bytes or an iterable of bytes. " \
- "It cannot be of type str."
+ msg = "POST data should be bytes, an iterable of bytes, " \
+ "or a file object. It cannot be of type str."
raise TypeError(msg)
if not request.has_header('Content-type'):
request.add_unredirected_header(
'Content-type',
'application/x-www-form-urlencoded')
- if not request.has_header('Content-length'):
- try:
- mv = memoryview(data)
- except TypeError:
- if isinstance(data, collections.Iterable):
- raise ValueError("Content-Length should be specified "
- "for iterable data of type %r %r" % (type(data),
- data))
+ if (not request.has_header('Content-length')
+ and not request.has_header('Transfer-encoding')):
+ content_length = self._get_content_length(request)
+ if content_length is not None:
+ request.add_unredirected_header(
+ 'Content-length', str(content_length))
else:
request.add_unredirected_header(
- 'Content-length', '%d' % (len(mv) * mv.itemsize))
+ 'Transfer-encoding', 'chunked')
sel_host = host
if request.has_proxy():
@@ -1251,7 +1314,8 @@ class AbstractHTTPHandler(BaseHandler):
try:
try:
- h.request(req.get_method(), req.selector, req.data, headers)
+ h.request(req.get_method(), req.selector, req.data, headers,
+ encode_chunked=req.has_header('Transfer-encoding'))
except OSError as err: # timeout error
raise URLError(err)
r = h.getresponse()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 8b69fd985e..9dab4c1c3a 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -10,7 +10,9 @@
http://www.robotstxt.org/norobots-rfc.txt
"""
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
__all__ = ["RobotFileParser"]
@@ -120,10 +122,29 @@ class RobotFileParser:
if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
state = 2
+ elif line[0] == "crawl-delay":
+ if state != 0:
+ # before trying to convert to int we need to make
+ # sure that robots.txt has valid syntax otherwise
+ # it will crash
+ if line[1].strip().isdigit():
+ entry.delay = int(line[1])
+ state = 2
+ elif line[0] == "request-rate":
+ if state != 0:
+ numbers = line[1].split('/')
+ # check if all values are sane
+ if (len(numbers) == 2 and numbers[0].strip().isdigit()
+ and numbers[1].strip().isdigit()):
+ req_rate = collections.namedtuple('req_rate',
+ 'requests seconds')
+ entry.req_rate = req_rate
+ entry.req_rate.requests = int(numbers[0])
+ entry.req_rate.seconds = int(numbers[1])
+ state = 2
if state == 2:
self._add_entry(entry)
-
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
if self.disallow_all:
@@ -153,6 +174,22 @@ class RobotFileParser:
# agent not found ==> access granted
return True
+ def crawl_delay(self, useragent):
+ if not self.mtime():
+ return None
+ for entry in self.entries:
+ if entry.applies_to(useragent):
+ return entry.delay
+ return self.default_entry.delay
+
+ def request_rate(self, useragent):
+ if not self.mtime():
+ return None
+ for entry in self.entries:
+ if entry.applies_to(useragent):
+ return entry.req_rate
+ return self.default_entry.req_rate
+
def __str__(self):
return ''.join([str(entry) + "\n" for entry in self.entries])
@@ -180,6 +217,8 @@ class Entry:
def __init__(self):
self.useragents = []
self.rulelines = []
+ self.delay = None
+ self.req_rate = None
def __str__(self):
ret = []