diff options
Diffstat (limited to 'bzrlib/transport/http/_pycurl.py')
-rw-r--r-- | bzrlib/transport/http/_pycurl.py | 448 |
1 files changed, 448 insertions, 0 deletions
diff --git a/bzrlib/transport/http/_pycurl.py b/bzrlib/transport/http/_pycurl.py new file mode 100644 index 0000000..612123d --- /dev/null +++ b/bzrlib/transport/http/_pycurl.py @@ -0,0 +1,448 @@ +# Copyright (C) 2006-2010 Canonical Ltd +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +"""http/https transport using pycurl""" + +from __future__ import absolute_import + +# TODO: test reporting of http errors +# +# TODO: Transport option to control caching of particular requests; broadly we +# would want to offer "caching allowed" or "must revalidate", depending on +# whether we expect a particular file will be modified after it's committed. +# It's probably safer to just always revalidate. mbp 20060321 + +# TODO: Some refactoring could be done to avoid the strange idiom +# used to capture data and headers while setting up the request +# (and having to pass 'header' to _curl_perform to handle +# redirections) . This could be achieved by creating a +# specialized Curl object and returning code, headers and data +# from _curl_perform. Not done because we may deprecate pycurl in the +# future -- vila 20070212 + +from cStringIO import StringIO +import httplib + +import bzrlib +from bzrlib import ( + debug, + errors, + trace, + ) +from bzrlib.transport.http import ( + ca_bundle, + HttpTransportBase, + response, + unhtml_roughly, + ) + +try: + import pycurl +except ImportError, e: + trace.mutter("failed to import pycurl: %s", e) + raise errors.DependencyNotPresent('pycurl', e) + +try: + # see if we can actually initialize PyCurl - sometimes it will load but + # fail to start up due to this bug: + # + # 32. (At least on Windows) If libcurl is built with c-ares and there's + # no DNS server configured in the system, the ares_init() call fails and + # thus curl_easy_init() fails as well. This causes weird effects for + # people who use numerical IP addresses only. + # + # reported by Alexander Belchenko, 2006-04-26 + pycurl.Curl() +except pycurl.error, e: + trace.mutter("failed to initialize pycurl: %s", e) + raise errors.DependencyNotPresent('pycurl', e) + + + + +def _get_pycurl_errcode(symbol, default): + """ + Returns the numerical error code for a symbol defined by pycurl. + + Different pycurl implementations define different symbols for error + codes. Old versions never define some symbols (wether they can return the + corresponding error code or not). The following addresses the problem by + defining the symbols we care about. Note: this allows to define symbols + for errors that older versions will never return, which is fine. + """ + return pycurl.__dict__.get(symbol, default) + +CURLE_COULDNT_CONNECT = _get_pycurl_errcode('E_COULDNT_CONNECT', 7) +CURLE_COULDNT_RESOLVE_HOST = _get_pycurl_errcode('E_COULDNT_RESOLVE_HOST', 6) +CURLE_COULDNT_RESOLVE_PROXY = _get_pycurl_errcode('E_COULDNT_RESOLVE_PROXY', 5) +CURLE_GOT_NOTHING = _get_pycurl_errcode('E_GOT_NOTHING', 52) +CURLE_PARTIAL_FILE = _get_pycurl_errcode('E_PARTIAL_FILE', 18) +CURLE_SEND_ERROR = _get_pycurl_errcode('E_SEND_ERROR', 55) +CURLE_RECV_ERROR = _get_pycurl_errcode('E_RECV_ERROR', 56) +CURLE_SSL_CACERT = _get_pycurl_errcode('E_SSL_CACERT', 60) +CURLE_SSL_CACERT_BADFILE = _get_pycurl_errcode('E_SSL_CACERT_BADFILE', 77) + + +class PyCurlTransport(HttpTransportBase): + """http client transport using pycurl + + PyCurl is a Python binding to the C "curl" multiprotocol client. + + This transport can be significantly faster than the builtin + Python client. Advantages include: DNS caching. + """ + + def __init__(self, base, _from_transport=None): + super(PyCurlTransport, self).__init__(base, 'pycurl', + _from_transport=_from_transport) + if self._unqualified_scheme == 'https': + # Check availability of https into pycurl supported + # protocols + supported = pycurl.version_info()[8] + if 'https' not in supported: + raise errors.DependencyNotPresent('pycurl', 'no https support') + self.cabundle = ca_bundle.get_ca_path() + + def _get_curl(self): + connection = self._get_connection() + if connection is None: + # First connection ever. There is no credentials for pycurl, either + # the password was embedded in the URL or it's not needed. The + # connection for pycurl is just the Curl object, it will not + # connect to the http server until the first request (which had + # just called us). + connection = pycurl.Curl() + # First request, initialize credentials. + auth = self._create_auth() + # Proxy handling is out of reach, so we punt + self._set_connection(connection, auth) + return connection + + def disconnect(self): + connection = self._get_connection() + if connection is not None: + connection.close() + + def has(self, relpath): + """See Transport.has()""" + # We set NO BODY=0 in _get_full, so it should be safe + # to re-use the non-range curl object + curl = self._get_curl() + abspath = self._remote_path(relpath) + curl.setopt(pycurl.URL, abspath) + self._set_curl_options(curl) + curl.setopt(pycurl.HTTPGET, 1) + # don't want the body - ie just do a HEAD request + # This means "NO BODY" not 'nobody' + curl.setopt(pycurl.NOBODY, 1) + # But we need headers to handle redirections + header = StringIO() + curl.setopt(pycurl.HEADERFUNCTION, header.write) + # In some erroneous cases, pycurl will emit text on + # stdout if we don't catch it (see InvalidStatus tests + # for one such occurrence). + blackhole = StringIO() + curl.setopt(pycurl.WRITEFUNCTION, blackhole.write) + self._curl_perform(curl, header) + code = curl.getinfo(pycurl.HTTP_CODE) + if code == 404: # not found + return False + elif code == 200: # "ok" + return True + else: + self._raise_curl_http_error(curl) + + def _get(self, relpath, offsets, tail_amount=0): + # This just switches based on the type of request + if offsets is not None or tail_amount not in (0, None): + return self._get_ranged(relpath, offsets, tail_amount=tail_amount) + else: + return self._get_full(relpath) + + def _setup_get_request(self, curl, relpath): + # Make sure we do a GET request. versions > 7.14.1 also set the + # NO BODY flag, but we'll do it ourselves in case it is an older + # pycurl version + curl.setopt(pycurl.NOBODY, 0) + curl.setopt(pycurl.HTTPGET, 1) + return self._setup_request(curl, relpath) + + def _setup_request(self, curl, relpath): + """Do the common setup stuff for making a request + + :param curl: The curl object to place the request on + :param relpath: The relative path that we want to get + :return: (abspath, data, header) + abspath: full url + data: file that will be filled with the body + header: file that will be filled with the headers + """ + abspath = self._remote_path(relpath) + curl.setopt(pycurl.URL, abspath) + self._set_curl_options(curl) + + data = StringIO() + header = StringIO() + curl.setopt(pycurl.WRITEFUNCTION, data.write) + curl.setopt(pycurl.HEADERFUNCTION, header.write) + + return abspath, data, header + + def _get_full(self, relpath): + """Make a request for the entire file""" + curl = self._get_curl() + abspath, data, header = self._setup_get_request(curl, relpath) + self._curl_perform(curl, header) + + code = curl.getinfo(pycurl.HTTP_CODE) + data.seek(0) + + if code == 404: + raise errors.NoSuchFile(abspath) + if code != 200: + self._raise_curl_http_error( + curl, 'expected 200 or 404 for full response.') + + return code, data + + # The parent class use 0 to minimize the requests, but since we can't + # exploit the results as soon as they are received (pycurl limitation) we'd + # better issue more requests and provide a more responsive UI incurring + # more latency costs. + # If you modify this, think about modifying the comment in http/__init__.py + # too. + _get_max_size = 4 * 1024 * 1024 + + def _get_ranged(self, relpath, offsets, tail_amount): + """Make a request for just part of the file.""" + curl = self._get_curl() + abspath, data, header = self._setup_get_request(curl, relpath) + + range_header = self._attempted_range_header(offsets, tail_amount) + if range_header is None: + # Forget ranges, the server can't handle them + return self._get_full(relpath) + + self._curl_perform(curl, header, ['Range: bytes=%s' % range_header]) + data.seek(0) + + code = curl.getinfo(pycurl.HTTP_CODE) + + if code == 404: # not found + raise errors.NoSuchFile(abspath) + elif code in (400, 416): + # We don't know which, but one of the ranges we specified was + # wrong. + raise errors.InvalidHttpRange(abspath, range_header, + 'Server return code %d' + % curl.getinfo(pycurl.HTTP_CODE)) + msg = self._parse_headers(header) + return code, response.handle_response(abspath, code, msg, data) + + def _parse_headers(self, status_and_headers): + """Transform the headers provided by curl into an HTTPMessage""" + status_and_headers.seek(0) + # Ignore status line + status_and_headers.readline() + msg = httplib.HTTPMessage(status_and_headers) + return msg + + def _post(self, body_bytes): + curl = self._get_curl() + abspath, data, header = self._setup_request(curl, '.bzr/smart') + curl.setopt(pycurl.POST, 1) + fake_file = StringIO(body_bytes) + curl.setopt(pycurl.POSTFIELDSIZE, len(body_bytes)) + curl.setopt(pycurl.READFUNCTION, fake_file.read) + # We override the Expect: header so that pycurl will send the POST + # body immediately. + try: + self._curl_perform(curl, header, + ['Expect: ', + 'Content-Type: application/octet-stream']) + except pycurl.error, e: + if e[0] == CURLE_SEND_ERROR: + # When talking to an HTTP/1.0 server, getting a 400+ error code + # triggers a bug in some combinations of curl/kernel in rare + # occurrences. Basically, the server closes the connection + # after sending the error but the client (having received and + # parsed the response) still try to send the request body (see + # bug #225020 and its upstream associated bug). Since the + # error code and the headers are known to be available, we just + # swallow the exception, leaving the upper levels handle the + # 400+ error. + trace.mutter('got pycurl error in POST: %s, %s, %s, url: %s ', + e[0], e[1], e, abspath) + else: + # Re-raise otherwise + raise + data.seek(0) + code = curl.getinfo(pycurl.HTTP_CODE) + msg = self._parse_headers(header) + return code, response.handle_response(abspath, code, msg, data) + + + def _raise_curl_http_error(self, curl, info=None, body=None): + """Common curl->bzrlib error translation. + + Some methods may choose to override this for particular cases. + + The URL and code are automatically included as appropriate. + + :param info: Extra information to include in the message. + + :param body: File-like object from which the body of the page can be + read. + """ + code = curl.getinfo(pycurl.HTTP_CODE) + url = curl.getinfo(pycurl.EFFECTIVE_URL) + if body is not None: + response_body = body.read() + plaintext_body = unhtml_roughly(response_body) + else: + response_body = None + plaintext_body = '' + if code == 403: + raise errors.TransportError( + 'Server refuses to fulfill the request (403 Forbidden)' + ' for %s: %s' % (url, plaintext_body)) + else: + if info is None: + msg = '' + else: + msg = ': ' + info + raise errors.InvalidHttpResponse( + url, 'Unable to handle http code %d%s: %s' + % (code, msg, plaintext_body)) + + def _debug_cb(self, kind, text): + if kind in (pycurl.INFOTYPE_HEADER_IN, pycurl.INFOTYPE_DATA_IN): + self._report_activity(len(text), 'read') + if (kind == pycurl.INFOTYPE_HEADER_IN + and 'http' in debug.debug_flags): + trace.mutter('< %s' % (text.rstrip(),)) + elif kind in (pycurl.INFOTYPE_HEADER_OUT, pycurl.INFOTYPE_DATA_OUT): + self._report_activity(len(text), 'write') + if (kind == pycurl.INFOTYPE_HEADER_OUT + and 'http' in debug.debug_flags): + lines = [] + for line in text.rstrip().splitlines(): + # People are often told to paste -Dhttp output to help + # debug. Don't compromise credentials. + try: + header, details = line.split(':', 1) + except ValueError: + header = None + if header in ('Authorization', 'Proxy-Authorization'): + line = '%s: <masked>' % (header,) + lines.append(line) + trace.mutter('> ' + '\n> '.join(lines)) + elif kind == pycurl.INFOTYPE_TEXT and 'http' in debug.debug_flags: + trace.mutter('* %s' % text.rstrip()) + elif (kind in (pycurl.INFOTYPE_TEXT, pycurl.INFOTYPE_SSL_DATA_IN, + pycurl.INFOTYPE_SSL_DATA_OUT) + and 'http' in debug.debug_flags): + trace.mutter('* %s' % text) + + def _set_curl_options(self, curl): + """Set options for all requests""" + ua_str = 'bzr/%s (pycurl: %s)' % (bzrlib.__version__, pycurl.version) + curl.setopt(pycurl.USERAGENT, ua_str) + curl.setopt(pycurl.VERBOSE, 1) + curl.setopt(pycurl.DEBUGFUNCTION, self._debug_cb) + if self.cabundle: + curl.setopt(pycurl.CAINFO, self.cabundle) + # Set accepted auth methods + curl.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY) + curl.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY) + auth = self._get_credentials() + user = auth.get('user', None) + password = auth.get('password', None) + userpass = None + if user is not None: + userpass = user + ':' + if password is not None: # '' is a valid password + userpass += password + curl.setopt(pycurl.USERPWD, userpass) + + def _curl_perform(self, curl, header, more_headers=[]): + """Perform curl operation and translate exceptions.""" + try: + # There's no way in http/1.0 to say "must + # revalidate"; we don't want to force it to always + # retrieve. so just turn off the default Pragma + # provided by Curl. + headers = ['Cache-control: max-age=0', + 'Pragma: no-cache', + 'Connection: Keep-Alive'] + curl.setopt(pycurl.HTTPHEADER, headers + more_headers) + curl.perform() + except pycurl.error, e: + url = curl.getinfo(pycurl.EFFECTIVE_URL) + trace.mutter('got pycurl error: %s, %s, %s, url: %s ', + e[0], e[1], e, url) + if e[0] in (CURLE_COULDNT_RESOLVE_HOST, + CURLE_COULDNT_RESOLVE_PROXY, + CURLE_COULDNT_CONNECT, + CURLE_GOT_NOTHING, + CURLE_SSL_CACERT, + CURLE_SSL_CACERT_BADFILE, + ): + raise errors.ConnectionError( + 'curl connection error (%s)\non %s' % (e[1], url)) + elif e[0] == CURLE_RECV_ERROR: + raise errors.ConnectionReset( + 'curl connection error (%s)\non %s' % (e[1], url)) + elif e[0] == CURLE_PARTIAL_FILE: + # Pycurl itself has detected a short read. We do not have all + # the information for the ShortReadvError, but that should be + # enough + raise errors.ShortReadvError(url, + offset='unknown', length='unknown', + actual='unknown', + extra='Server aborted the request') + raise + code = curl.getinfo(pycurl.HTTP_CODE) + if code in (301, 302, 303, 307): + url = curl.getinfo(pycurl.EFFECTIVE_URL) + msg = self._parse_headers(header) + redirected_to = msg.getheader('location') + raise errors.RedirectRequested(url, + redirected_to, + is_permanent=(code == 301)) + + +def get_test_permutations(): + """Return the permutations to be used in testing.""" + from bzrlib.tests import features + from bzrlib.tests import http_server + permutations = [(PyCurlTransport, http_server.HttpServer_PyCurl),] + if features.HTTPSServerFeature.available(): + from bzrlib.tests import ( + https_server, + ssl_certs, + ) + + class HTTPS_pycurl_transport(PyCurlTransport): + + def __init__(self, base, _from_transport=None): + super(HTTPS_pycurl_transport, self).__init__(base, + _from_transport) + self.cabundle = str(ssl_certs.build_path('ca.crt')) + + permutations.append((HTTPS_pycurl_transport, + https_server.HTTPSServer_PyCurl)) + return permutations |