summaryrefslogtreecommitdiff
path: root/bzrlib/transport/http/_pycurl.py
diff options
context:
space:
mode:
Diffstat (limited to 'bzrlib/transport/http/_pycurl.py')
-rw-r--r--bzrlib/transport/http/_pycurl.py448
1 files changed, 448 insertions, 0 deletions
diff --git a/bzrlib/transport/http/_pycurl.py b/bzrlib/transport/http/_pycurl.py
new file mode 100644
index 0000000..612123d
--- /dev/null
+++ b/bzrlib/transport/http/_pycurl.py
@@ -0,0 +1,448 @@
+# Copyright (C) 2006-2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+"""http/https transport using pycurl"""
+
+from __future__ import absolute_import
+
+# TODO: test reporting of http errors
+#
+# TODO: Transport option to control caching of particular requests; broadly we
+# would want to offer "caching allowed" or "must revalidate", depending on
+# whether we expect a particular file will be modified after it's committed.
+# It's probably safer to just always revalidate. mbp 20060321
+
+# TODO: Some refactoring could be done to avoid the strange idiom
+# used to capture data and headers while setting up the request
+# (and having to pass 'header' to _curl_perform to handle
+# redirections) . This could be achieved by creating a
+# specialized Curl object and returning code, headers and data
+# from _curl_perform. Not done because we may deprecate pycurl in the
+# future -- vila 20070212
+
+from cStringIO import StringIO
+import httplib
+
+import bzrlib
+from bzrlib import (
+ debug,
+ errors,
+ trace,
+ )
+from bzrlib.transport.http import (
+ ca_bundle,
+ HttpTransportBase,
+ response,
+ unhtml_roughly,
+ )
+
+try:
+ import pycurl
+except ImportError, e:
+ trace.mutter("failed to import pycurl: %s", e)
+ raise errors.DependencyNotPresent('pycurl', e)
+
+try:
+ # see if we can actually initialize PyCurl - sometimes it will load but
+ # fail to start up due to this bug:
+ #
+ # 32. (At least on Windows) If libcurl is built with c-ares and there's
+ # no DNS server configured in the system, the ares_init() call fails and
+ # thus curl_easy_init() fails as well. This causes weird effects for
+ # people who use numerical IP addresses only.
+ #
+ # reported by Alexander Belchenko, 2006-04-26
+ pycurl.Curl()
+except pycurl.error, e:
+ trace.mutter("failed to initialize pycurl: %s", e)
+ raise errors.DependencyNotPresent('pycurl', e)
+
+
+
+
+def _get_pycurl_errcode(symbol, default):
+ """
+ Returns the numerical error code for a symbol defined by pycurl.
+
+ Different pycurl implementations define different symbols for error
+ codes. Old versions never define some symbols (wether they can return the
+ corresponding error code or not). The following addresses the problem by
+ defining the symbols we care about. Note: this allows to define symbols
+ for errors that older versions will never return, which is fine.
+ """
+ return pycurl.__dict__.get(symbol, default)
+
+CURLE_COULDNT_CONNECT = _get_pycurl_errcode('E_COULDNT_CONNECT', 7)
+CURLE_COULDNT_RESOLVE_HOST = _get_pycurl_errcode('E_COULDNT_RESOLVE_HOST', 6)
+CURLE_COULDNT_RESOLVE_PROXY = _get_pycurl_errcode('E_COULDNT_RESOLVE_PROXY', 5)
+CURLE_GOT_NOTHING = _get_pycurl_errcode('E_GOT_NOTHING', 52)
+CURLE_PARTIAL_FILE = _get_pycurl_errcode('E_PARTIAL_FILE', 18)
+CURLE_SEND_ERROR = _get_pycurl_errcode('E_SEND_ERROR', 55)
+CURLE_RECV_ERROR = _get_pycurl_errcode('E_RECV_ERROR', 56)
+CURLE_SSL_CACERT = _get_pycurl_errcode('E_SSL_CACERT', 60)
+CURLE_SSL_CACERT_BADFILE = _get_pycurl_errcode('E_SSL_CACERT_BADFILE', 77)
+
+
+class PyCurlTransport(HttpTransportBase):
+ """http client transport using pycurl
+
+ PyCurl is a Python binding to the C "curl" multiprotocol client.
+
+ This transport can be significantly faster than the builtin
+ Python client. Advantages include: DNS caching.
+ """
+
+ def __init__(self, base, _from_transport=None):
+ super(PyCurlTransport, self).__init__(base, 'pycurl',
+ _from_transport=_from_transport)
+ if self._unqualified_scheme == 'https':
+ # Check availability of https into pycurl supported
+ # protocols
+ supported = pycurl.version_info()[8]
+ if 'https' not in supported:
+ raise errors.DependencyNotPresent('pycurl', 'no https support')
+ self.cabundle = ca_bundle.get_ca_path()
+
+ def _get_curl(self):
+ connection = self._get_connection()
+ if connection is None:
+ # First connection ever. There is no credentials for pycurl, either
+ # the password was embedded in the URL or it's not needed. The
+ # connection for pycurl is just the Curl object, it will not
+ # connect to the http server until the first request (which had
+ # just called us).
+ connection = pycurl.Curl()
+ # First request, initialize credentials.
+ auth = self._create_auth()
+ # Proxy handling is out of reach, so we punt
+ self._set_connection(connection, auth)
+ return connection
+
+ def disconnect(self):
+ connection = self._get_connection()
+ if connection is not None:
+ connection.close()
+
+ def has(self, relpath):
+ """See Transport.has()"""
+ # We set NO BODY=0 in _get_full, so it should be safe
+ # to re-use the non-range curl object
+ curl = self._get_curl()
+ abspath = self._remote_path(relpath)
+ curl.setopt(pycurl.URL, abspath)
+ self._set_curl_options(curl)
+ curl.setopt(pycurl.HTTPGET, 1)
+ # don't want the body - ie just do a HEAD request
+ # This means "NO BODY" not 'nobody'
+ curl.setopt(pycurl.NOBODY, 1)
+ # But we need headers to handle redirections
+ header = StringIO()
+ curl.setopt(pycurl.HEADERFUNCTION, header.write)
+ # In some erroneous cases, pycurl will emit text on
+ # stdout if we don't catch it (see InvalidStatus tests
+ # for one such occurrence).
+ blackhole = StringIO()
+ curl.setopt(pycurl.WRITEFUNCTION, blackhole.write)
+ self._curl_perform(curl, header)
+ code = curl.getinfo(pycurl.HTTP_CODE)
+ if code == 404: # not found
+ return False
+ elif code == 200: # "ok"
+ return True
+ else:
+ self._raise_curl_http_error(curl)
+
+ def _get(self, relpath, offsets, tail_amount=0):
+ # This just switches based on the type of request
+ if offsets is not None or tail_amount not in (0, None):
+ return self._get_ranged(relpath, offsets, tail_amount=tail_amount)
+ else:
+ return self._get_full(relpath)
+
+ def _setup_get_request(self, curl, relpath):
+ # Make sure we do a GET request. versions > 7.14.1 also set the
+ # NO BODY flag, but we'll do it ourselves in case it is an older
+ # pycurl version
+ curl.setopt(pycurl.NOBODY, 0)
+ curl.setopt(pycurl.HTTPGET, 1)
+ return self._setup_request(curl, relpath)
+
+ def _setup_request(self, curl, relpath):
+ """Do the common setup stuff for making a request
+
+ :param curl: The curl object to place the request on
+ :param relpath: The relative path that we want to get
+ :return: (abspath, data, header)
+ abspath: full url
+ data: file that will be filled with the body
+ header: file that will be filled with the headers
+ """
+ abspath = self._remote_path(relpath)
+ curl.setopt(pycurl.URL, abspath)
+ self._set_curl_options(curl)
+
+ data = StringIO()
+ header = StringIO()
+ curl.setopt(pycurl.WRITEFUNCTION, data.write)
+ curl.setopt(pycurl.HEADERFUNCTION, header.write)
+
+ return abspath, data, header
+
+ def _get_full(self, relpath):
+ """Make a request for the entire file"""
+ curl = self._get_curl()
+ abspath, data, header = self._setup_get_request(curl, relpath)
+ self._curl_perform(curl, header)
+
+ code = curl.getinfo(pycurl.HTTP_CODE)
+ data.seek(0)
+
+ if code == 404:
+ raise errors.NoSuchFile(abspath)
+ if code != 200:
+ self._raise_curl_http_error(
+ curl, 'expected 200 or 404 for full response.')
+
+ return code, data
+
+ # The parent class use 0 to minimize the requests, but since we can't
+ # exploit the results as soon as they are received (pycurl limitation) we'd
+ # better issue more requests and provide a more responsive UI incurring
+ # more latency costs.
+ # If you modify this, think about modifying the comment in http/__init__.py
+ # too.
+ _get_max_size = 4 * 1024 * 1024
+
+ def _get_ranged(self, relpath, offsets, tail_amount):
+ """Make a request for just part of the file."""
+ curl = self._get_curl()
+ abspath, data, header = self._setup_get_request(curl, relpath)
+
+ range_header = self._attempted_range_header(offsets, tail_amount)
+ if range_header is None:
+ # Forget ranges, the server can't handle them
+ return self._get_full(relpath)
+
+ self._curl_perform(curl, header, ['Range: bytes=%s' % range_header])
+ data.seek(0)
+
+ code = curl.getinfo(pycurl.HTTP_CODE)
+
+ if code == 404: # not found
+ raise errors.NoSuchFile(abspath)
+ elif code in (400, 416):
+ # We don't know which, but one of the ranges we specified was
+ # wrong.
+ raise errors.InvalidHttpRange(abspath, range_header,
+ 'Server return code %d'
+ % curl.getinfo(pycurl.HTTP_CODE))
+ msg = self._parse_headers(header)
+ return code, response.handle_response(abspath, code, msg, data)
+
+ def _parse_headers(self, status_and_headers):
+ """Transform the headers provided by curl into an HTTPMessage"""
+ status_and_headers.seek(0)
+ # Ignore status line
+ status_and_headers.readline()
+ msg = httplib.HTTPMessage(status_and_headers)
+ return msg
+
+ def _post(self, body_bytes):
+ curl = self._get_curl()
+ abspath, data, header = self._setup_request(curl, '.bzr/smart')
+ curl.setopt(pycurl.POST, 1)
+ fake_file = StringIO(body_bytes)
+ curl.setopt(pycurl.POSTFIELDSIZE, len(body_bytes))
+ curl.setopt(pycurl.READFUNCTION, fake_file.read)
+ # We override the Expect: header so that pycurl will send the POST
+ # body immediately.
+ try:
+ self._curl_perform(curl, header,
+ ['Expect: ',
+ 'Content-Type: application/octet-stream'])
+ except pycurl.error, e:
+ if e[0] == CURLE_SEND_ERROR:
+ # When talking to an HTTP/1.0 server, getting a 400+ error code
+ # triggers a bug in some combinations of curl/kernel in rare
+ # occurrences. Basically, the server closes the connection
+ # after sending the error but the client (having received and
+ # parsed the response) still try to send the request body (see
+ # bug #225020 and its upstream associated bug). Since the
+ # error code and the headers are known to be available, we just
+ # swallow the exception, leaving the upper levels handle the
+ # 400+ error.
+ trace.mutter('got pycurl error in POST: %s, %s, %s, url: %s ',
+ e[0], e[1], e, abspath)
+ else:
+ # Re-raise otherwise
+ raise
+ data.seek(0)
+ code = curl.getinfo(pycurl.HTTP_CODE)
+ msg = self._parse_headers(header)
+ return code, response.handle_response(abspath, code, msg, data)
+
+
+ def _raise_curl_http_error(self, curl, info=None, body=None):
+ """Common curl->bzrlib error translation.
+
+ Some methods may choose to override this for particular cases.
+
+ The URL and code are automatically included as appropriate.
+
+ :param info: Extra information to include in the message.
+
+ :param body: File-like object from which the body of the page can be
+ read.
+ """
+ code = curl.getinfo(pycurl.HTTP_CODE)
+ url = curl.getinfo(pycurl.EFFECTIVE_URL)
+ if body is not None:
+ response_body = body.read()
+ plaintext_body = unhtml_roughly(response_body)
+ else:
+ response_body = None
+ plaintext_body = ''
+ if code == 403:
+ raise errors.TransportError(
+ 'Server refuses to fulfill the request (403 Forbidden)'
+ ' for %s: %s' % (url, plaintext_body))
+ else:
+ if info is None:
+ msg = ''
+ else:
+ msg = ': ' + info
+ raise errors.InvalidHttpResponse(
+ url, 'Unable to handle http code %d%s: %s'
+ % (code, msg, plaintext_body))
+
+ def _debug_cb(self, kind, text):
+ if kind in (pycurl.INFOTYPE_HEADER_IN, pycurl.INFOTYPE_DATA_IN):
+ self._report_activity(len(text), 'read')
+ if (kind == pycurl.INFOTYPE_HEADER_IN
+ and 'http' in debug.debug_flags):
+ trace.mutter('< %s' % (text.rstrip(),))
+ elif kind in (pycurl.INFOTYPE_HEADER_OUT, pycurl.INFOTYPE_DATA_OUT):
+ self._report_activity(len(text), 'write')
+ if (kind == pycurl.INFOTYPE_HEADER_OUT
+ and 'http' in debug.debug_flags):
+ lines = []
+ for line in text.rstrip().splitlines():
+ # People are often told to paste -Dhttp output to help
+ # debug. Don't compromise credentials.
+ try:
+ header, details = line.split(':', 1)
+ except ValueError:
+ header = None
+ if header in ('Authorization', 'Proxy-Authorization'):
+ line = '%s: <masked>' % (header,)
+ lines.append(line)
+ trace.mutter('> ' + '\n> '.join(lines))
+ elif kind == pycurl.INFOTYPE_TEXT and 'http' in debug.debug_flags:
+ trace.mutter('* %s' % text.rstrip())
+ elif (kind in (pycurl.INFOTYPE_TEXT, pycurl.INFOTYPE_SSL_DATA_IN,
+ pycurl.INFOTYPE_SSL_DATA_OUT)
+ and 'http' in debug.debug_flags):
+ trace.mutter('* %s' % text)
+
+ def _set_curl_options(self, curl):
+ """Set options for all requests"""
+ ua_str = 'bzr/%s (pycurl: %s)' % (bzrlib.__version__, pycurl.version)
+ curl.setopt(pycurl.USERAGENT, ua_str)
+ curl.setopt(pycurl.VERBOSE, 1)
+ curl.setopt(pycurl.DEBUGFUNCTION, self._debug_cb)
+ if self.cabundle:
+ curl.setopt(pycurl.CAINFO, self.cabundle)
+ # Set accepted auth methods
+ curl.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
+ curl.setopt(pycurl.PROXYAUTH, pycurl.HTTPAUTH_ANY)
+ auth = self._get_credentials()
+ user = auth.get('user', None)
+ password = auth.get('password', None)
+ userpass = None
+ if user is not None:
+ userpass = user + ':'
+ if password is not None: # '' is a valid password
+ userpass += password
+ curl.setopt(pycurl.USERPWD, userpass)
+
+ def _curl_perform(self, curl, header, more_headers=[]):
+ """Perform curl operation and translate exceptions."""
+ try:
+ # There's no way in http/1.0 to say "must
+ # revalidate"; we don't want to force it to always
+ # retrieve. so just turn off the default Pragma
+ # provided by Curl.
+ headers = ['Cache-control: max-age=0',
+ 'Pragma: no-cache',
+ 'Connection: Keep-Alive']
+ curl.setopt(pycurl.HTTPHEADER, headers + more_headers)
+ curl.perform()
+ except pycurl.error, e:
+ url = curl.getinfo(pycurl.EFFECTIVE_URL)
+ trace.mutter('got pycurl error: %s, %s, %s, url: %s ',
+ e[0], e[1], e, url)
+ if e[0] in (CURLE_COULDNT_RESOLVE_HOST,
+ CURLE_COULDNT_RESOLVE_PROXY,
+ CURLE_COULDNT_CONNECT,
+ CURLE_GOT_NOTHING,
+ CURLE_SSL_CACERT,
+ CURLE_SSL_CACERT_BADFILE,
+ ):
+ raise errors.ConnectionError(
+ 'curl connection error (%s)\non %s' % (e[1], url))
+ elif e[0] == CURLE_RECV_ERROR:
+ raise errors.ConnectionReset(
+ 'curl connection error (%s)\non %s' % (e[1], url))
+ elif e[0] == CURLE_PARTIAL_FILE:
+ # Pycurl itself has detected a short read. We do not have all
+ # the information for the ShortReadvError, but that should be
+ # enough
+ raise errors.ShortReadvError(url,
+ offset='unknown', length='unknown',
+ actual='unknown',
+ extra='Server aborted the request')
+ raise
+ code = curl.getinfo(pycurl.HTTP_CODE)
+ if code in (301, 302, 303, 307):
+ url = curl.getinfo(pycurl.EFFECTIVE_URL)
+ msg = self._parse_headers(header)
+ redirected_to = msg.getheader('location')
+ raise errors.RedirectRequested(url,
+ redirected_to,
+ is_permanent=(code == 301))
+
+
+def get_test_permutations():
+ """Return the permutations to be used in testing."""
+ from bzrlib.tests import features
+ from bzrlib.tests import http_server
+ permutations = [(PyCurlTransport, http_server.HttpServer_PyCurl),]
+ if features.HTTPSServerFeature.available():
+ from bzrlib.tests import (
+ https_server,
+ ssl_certs,
+ )
+
+ class HTTPS_pycurl_transport(PyCurlTransport):
+
+ def __init__(self, base, _from_transport=None):
+ super(HTTPS_pycurl_transport, self).__init__(base,
+ _from_transport)
+ self.cabundle = str(ssl_certs.build_path('ca.crt'))
+
+ permutations.append((HTTPS_pycurl_transport,
+ https_server.HTTPSServer_PyCurl))
+ return permutations