diff options
author | Alistair Coles <alistairncoles@gmail.com> | 2020-11-03 11:08:56 +0000 |
---|---|---|
committer | Alistair Coles <alistairncoles@gmail.com> | 2021-01-06 16:28:49 +0000 |
commit | 077ba77ea6f735c98709b60e1fc9aa9f0f90676a (patch) | |
tree | a09bf1aba38d6dda859aadf60e56fdd1f87c99d0 /swift/proxy/controllers/container.py | |
parent | 453983764727ba528222a413691a0f0fc5a8f8cf (diff) | |
download | swift-077ba77ea6f735c98709b60e1fc9aa9f0f90676a.tar.gz |
Use cached shard ranges for container GETs
This patch makes four significant changes to the handling of GET
requests for sharding or sharded containers:
- container server GET requests may now result in the entire list of
shard ranges being returned for the 'listing' state regardless of
any request parameter constraints.
- the proxy server may cache that list of shard ranges in memcache
and the requests environ infocache dict, and subsequently use the
cached shard ranges when handling GET requests for the same
container.
- the proxy now caches more container metadata so that it can
synthesize a complete set of container GET response headers from
cache.
- the proxy server now enforces more container GET request validity
checks that were previously only enforced by the backend server,
e.g. checks for valid request parameter values
With this change, when the proxy learns from container metadata
that the container is sharded then it will cache shard
ranges fetched from the backend during a container GET in memcache.
On subsequent container GETs the proxy will use the cached shard
ranges to gather object listings from shard containers, avoiding
further GET requests to the root container until the cached shard
ranges expire from cache.
Cached shard ranges are most useful if they cover the entire object
name space in the container. The proxy therefore uses a new
X-Backend-Override-Shard-Name-Filter header to instruct the container
server to ignore any request parameters that would constrain the
returned shard range listing i.e. 'marker', 'end_marker', 'includes'
and 'reverse' parameters. Having obtained the entire shard range
listing (either from the server or from cache) the proxy now applies
those request parameter constraints itself when constructing the
client response.
When using cached shard ranges the proxy will synthesize response
headers from the container metadata that is also in cache. To enable
the full set of container GET response headers to be synthezised in
this way, the set of metadata that the proxy caches when handling a
backend container GET response is expanded to include various
timestamps.
The X-Newest header may be used to disable looking up shard ranges
in cache.
Change-Id: I5fc696625d69d1ee9218ee2a508a1b9be6cf9685
Diffstat (limited to 'swift/proxy/controllers/container.py')
-rw-r--r-- | swift/proxy/controllers/container.py | 245 |
1 files changed, 211 insertions, 34 deletions
diff --git a/swift/proxy/controllers/container.py b/swift/proxy/controllers/container.py index 00e460171..19f631c45 100644 --- a/swift/proxy/controllers/container.py +++ b/swift/proxy/controllers/container.py @@ -15,21 +15,23 @@ from swift import gettext_ as _ import json +import math import six from six.moves.urllib.parse import unquote from swift.common.utils import public, private, csv_append, Timestamp, \ - config_true_value, ShardRange + config_true_value, ShardRange, cache_from_env, filter_shard_ranges from swift.common.constraints import check_metadata, CONTAINER_LISTING_LIMIT from swift.common.http import HTTP_ACCEPTED, is_success -from swift.common.request_helpers import get_sys_meta_prefix +from swift.common.request_helpers import get_sys_meta_prefix, get_param, \ + constrain_req_limit, validate_container_params from swift.proxy.controllers.base import Controller, delay_denial, \ - cors_validation, set_info_cache, clear_info_cache + cors_validation, set_info_cache, clear_info_cache, _get_info_from_caches, \ + get_cache_key, headers_from_container_info, update_headers from swift.common.storage_policy import POLICIES -from swift.common.swob import HTTPBadRequest, HTTPForbidden, \ - HTTPNotFound, HTTPServiceUnavailable, str_to_wsgi, wsgi_to_str, \ - bytes_to_wsgi +from swift.common.swob import HTTPBadRequest, HTTPForbidden, HTTPNotFound, \ + HTTPServiceUnavailable, str_to_wsgi, wsgi_to_str, bytes_to_wsgi, Response class ContainerController(Controller): @@ -87,6 +89,144 @@ class ContainerController(Controller): return HTTPBadRequest(request=req, body=str(err)) return None + def _clear_container_info_cache(self, req): + clear_info_cache(self.app, req.environ, + self.account_name, self.container_name) + clear_info_cache(self.app, req.environ, + self.account_name, self.container_name, 'listing') + # TODO: should we also purge updating shards from cache? + + def _GETorHEAD_from_backend(self, req): + part = self.app.container_ring.get_part( + self.account_name, self.container_name) + concurrency = self.app.container_ring.replica_count \ + if self.app.get_policy_options(None).concurrent_gets else 1 + node_iter = self.app.iter_nodes(self.app.container_ring, part) + resp = self.GETorHEAD_base( + req, _('Container'), node_iter, part, + req.swift_entity_path, concurrency) + return resp + + def _filter_resp_shard_ranges(self, req, cached_ranges): + # filter returned shard ranges according to request constraints + marker = get_param(req, 'marker', '') + end_marker = get_param(req, 'end_marker') + includes = get_param(req, 'includes') + reverse = config_true_value(get_param(req, 'reverse')) + if reverse: + marker, end_marker = end_marker, marker + shard_ranges = [ + ShardRange.from_dict(shard_range) + for shard_range in cached_ranges] + shard_ranges = filter_shard_ranges(shard_ranges, includes, marker, + end_marker) + if reverse: + shard_ranges.reverse() + return json.dumps([dict(sr) for sr in shard_ranges]).encode('ascii') + + def _GET_using_cache(self, req): + # It may be possible to fulfil the request from cache: we only reach + # here if request record_type is 'shard' or 'auto', so if the container + # state is 'sharded' then look for cached shard ranges. However, if + # X-Newest is true then we always fetch from the backend servers. + get_newest = config_true_value(req.headers.get('x-newest', False)) + if get_newest: + self.app.logger.debug( + 'Skipping shard cache lookup (x-newest) for %s', req.path_qs) + info = None + else: + info = _get_info_from_caches(self.app, req.environ, + self.account_name, + self.container_name) + if (info and is_success(info['status']) and + info.get('sharding_state') == 'sharded'): + # container is sharded so we may have the shard ranges cached + headers = headers_from_container_info(info) + if headers: + # only use cached values if all required headers available + infocache = req.environ.setdefault('swift.infocache', {}) + memcache = cache_from_env(req.environ, True) + cache_key = get_cache_key(self.account_name, + self.container_name, + shard='listing') + cached_ranges = infocache.get(cache_key) + if cached_ranges is None and memcache: + cached_ranges = memcache.get(cache_key) + if cached_ranges is not None: + infocache[cache_key] = tuple(cached_ranges) + # shard ranges can be returned from cache + self.app.logger.debug('Found %d shards in cache for %s', + len(cached_ranges), req.path_qs) + headers.update({'x-backend-record-type': 'shard', + 'x-backend-cached-results': 'true'}) + shard_range_body = self._filter_resp_shard_ranges( + req, cached_ranges) + # mimic GetOrHeadHandler.get_working_response... + # note: server sets charset with content_type but proxy + # GETorHEAD_base does not, so don't set it here either + resp = Response(request=req, body=shard_range_body) + update_headers(resp, headers) + resp.last_modified = math.ceil( + float(headers['x-put-timestamp'])) + resp.environ['swift_x_timestamp'] = headers.get( + 'x-timestamp') + resp.accept_ranges = 'bytes' + resp.content_type = 'application/json' + return resp + + # The request was not fulfilled from cache so send to the backend + # server, but instruct the backend server to ignore name constraints in + # request params if returning shard ranges so that the response can + # potentially be cached. Only do this if the container state is + # 'sharded'. We don't attempt to cache shard ranges for a 'sharding' + # container as they may include the container itself as a 'gap filler' + # for shard ranges that have not yet cleaved; listings from 'gap + # filler' shard ranges are likely to become stale as the container + # continues to cleave objects to its shards and caching them is + # therefore more likely to result in stale or incomplete listings on + # subsequent container GETs. + req.headers['x-backend-override-shard-name-filter'] = 'sharded' + resp = self._GETorHEAD_from_backend(req) + + sharding_state = resp.headers.get( + 'x-backend-sharding-state', '').lower() + resp_record_type = resp.headers.get( + 'x-backend-record-type', '').lower() + complete_listing = config_true_value(resp.headers.pop( + 'x-backend-override-shard-name-filter', False)) + # given that we sent 'x-backend-override-shard-name-filter=sharded' we + # should only receive back 'x-backend-override-shard-name-filter=true' + # if the sharding state is 'sharded', but check them both anyway... + if (resp_record_type == 'shard' and + sharding_state == 'sharded' and + complete_listing): + # backend returned unfiltered listing state shard ranges so parse + # them and replace response body with filtered listing + cache_key = get_cache_key(self.account_name, self.container_name, + shard='listing') + data = self._parse_listing_response(req, resp) + backend_shard_ranges = self._parse_shard_ranges(req, data, resp) + if backend_shard_ranges is not None: + cached_ranges = [dict(sr) for sr in backend_shard_ranges] + if resp.headers.get('x-backend-sharding-state') == 'sharded': + # cache in infocache even if no shard ranges returned; this + # is unexpected but use that result for this request + infocache = req.environ.setdefault('swift.infocache', {}) + infocache[cache_key] = tuple(cached_ranges) + memcache = cache_from_env(req.environ, True) + if memcache and cached_ranges: + # cache in memcache only if shard ranges as expected + self.app.logger.debug('Caching %d shards for %s', + len(cached_ranges), req.path_qs) + memcache.set( + cache_key, cached_ranges, + time=self.app.recheck_listing_shard_ranges) + + # filter returned shard ranges according to request constraints + resp.body = self._filter_resp_shard_ranges(req, cached_ranges) + + return resp + def GETorHEAD(self, req): """Handler for HTTP GET/HEAD requests.""" ai = self.account_info(self.account_name, req) @@ -102,33 +242,51 @@ class ContainerController(Controller): # Don't cache this. The lack of account will be cached, and that # is sufficient. return HTTPNotFound(request=req) - part = self.app.container_ring.get_part( - self.account_name, self.container_name) - concurrency = self.app.container_ring.replica_count \ - if self.app.get_policy_options(None).concurrent_gets else 1 - node_iter = self.app.iter_nodes(self.app.container_ring, part) + + # The read-modify-write of params here is because the Request.params + # getter dynamically generates a dict of params from the query string; + # the setter must be called for new params to update the query string. params = req.params params['format'] = 'json' + # x-backend-record-type may be sent via internal client e.g. from + # the sharder or in probe tests record_type = req.headers.get('X-Backend-Record-Type', '').lower() if not record_type: record_type = 'auto' req.headers['X-Backend-Record-Type'] = 'auto' params['states'] = 'listing' req.params = params - resp = self.GETorHEAD_base( - req, _('Container'), node_iter, part, - req.swift_entity_path, concurrency) + + memcache = cache_from_env(req.environ, True) + if (req.method == 'GET' and + record_type != 'object' and + self.app.recheck_listing_shard_ranges > 0 and + memcache and + get_param(req, 'states') == 'listing' and + not config_true_value( + req.headers.get('x-backend-include-deleted', False))): + # This GET might be served from cache or might populate cache. + # 'x-backend-include-deleted' is not usually expected in requests + # to the proxy (it is used from sharder to container servers) but + # it is included in the conditions just in case because we don't + # cache deleted shard ranges. + resp = self._GET_using_cache(req) + else: + resp = self._GETorHEAD_from_backend(req) + resp_record_type = resp.headers.get('X-Backend-Record-Type', '') if all((req.method == "GET", record_type == 'auto', resp_record_type.lower() == 'shard')): resp = self._get_from_shards(req, resp) - # Cache this. We just made a request to a storage node and got - # up-to-date information for the container. - resp.headers['X-Backend-Recheck-Container-Existence'] = str( - self.app.recheck_container_existence) - set_info_cache(self.app, req.environ, self.account_name, - self.container_name, resp) + if not config_true_value( + resp.headers.get('X-Backend-Cached-Results')): + # Cache container metadata. We just made a request to a storage + # node and got up-to-date information for the container. + resp.headers['X-Backend-Recheck-Container-Existence'] = str( + self.app.recheck_container_existence) + set_info_cache(self.app, req.environ, self.account_name, + self.container_name, resp) if 'swift.authorize' in req.environ: req.acl = resp.headers.get('x-container-read') aresp = req.environ['swift.authorize'](req) @@ -171,7 +329,7 @@ class ContainerController(Controller): return resp objects = [] - req_limit = int(req.params.get('limit') or CONTAINER_LISTING_LIMIT) + req_limit = constrain_req_limit(req, CONTAINER_LISTING_LIMIT) params = req.params.copy() params.pop('states', None) req.headers.pop('X-Backend-Record-Type', None) @@ -181,7 +339,7 @@ class ContainerController(Controller): prefix = wsgi_to_str(params.get('prefix')) limit = req_limit - for shard_range in shard_ranges: + for i, shard_range in enumerate(shard_ranges): params['limit'] = limit # Always set marker to ensure that object names less than or equal # to those already in the listing are not fetched; if the listing @@ -207,12 +365,13 @@ class ContainerController(Controller): else: params['end_marker'] = str_to_wsgi(shard_range.end_marker) + headers = {} if ((shard_range.account, shard_range.container) in shard_listing_history): # directed back to same container - force GET of objects - headers = {'X-Backend-Record-Type': 'object'} - else: - headers = None + headers['X-Backend-Record-Type'] = 'object' + if config_true_value(req.headers.get('x-newest', False)): + headers['X-Newest'] = 'true' if prefix: if prefix > shard_range: @@ -225,14 +384,33 @@ class ContainerController(Controller): if just_past < shard_range: continue - self.app.logger.debug('Getting from %s %s with %s', - shard_range, shard_range.name, headers) + self.app.logger.debug( + 'Getting listing part %d from shard %s %s with %s', + i, shard_range, shard_range.name, headers) objs, shard_resp = self._get_container_listing( req, shard_range.account, shard_range.container, headers=headers, params=params) + shard_state = 'unknown' + try: + shard_state = shard_resp.headers['x-backend-sharding-state'] + shard_state = ShardRange.resolve_state(shard_state) + except (AttributeError, ValueError, KeyError): + pass + + if objs is None: + # tolerate errors + self.app.logger.debug( + 'Failed to get objects from shard (state=%s), total = %d', + shard_state, len(objects)) + continue + + self.app.logger.debug( + 'Found %d objects in shard (state=%s), total = %d', + len(objs), shard_state, len(objs) + len(objects)) + if not objs: - # tolerate errors or empty shard containers + # tolerate empty shard containers continue objects.extend(objs) @@ -270,6 +448,8 @@ class ContainerController(Controller): @cors_validation def GET(self, req): """Handler for HTTP GET requests.""" + # early checks for request validity + validate_container_params(req) return self.GETorHEAD(req) @public @@ -328,8 +508,7 @@ class ContainerController(Controller): resp = self.make_requests( req, self.app.container_ring, container_partition, 'PUT', req.swift_entity_path, headers) - clear_info_cache(self.app, req.environ, - self.account_name, self.container_name) + self._clear_container_info_cache(req) return resp @public @@ -354,8 +533,7 @@ class ContainerController(Controller): container_partition, containers = self.app.container_ring.get_nodes( self.account_name, self.container_name) headers = self.generate_request_headers(req, transfer=True) - clear_info_cache(self.app, req.environ, - self.account_name, self.container_name) + self._clear_container_info_cache(req) resp = self.make_requests( req, self.app.container_ring, container_partition, 'POST', req.swift_entity_path, [headers] * len(containers)) @@ -373,8 +551,7 @@ class ContainerController(Controller): self.account_name, self.container_name) headers = self._backend_requests(req, len(containers), account_partition, accounts) - clear_info_cache(self.app, req.environ, - self.account_name, self.container_name) + self._clear_container_info_cache(req) resp = self.make_requests( req, self.app.container_ring, container_partition, 'DELETE', req.swift_entity_path, headers) |