Use cached shard ranges for container GETs

This patch makes four significant changes to the handling of GET requests for sharding or sharded containers: - container server GET requests may now result in the entire list of shard ranges being returned for the 'listing' state regardless of any request parameter constraints. - the proxy server may cache that list of shard ranges in memcache and the requests environ infocache dict, and subsequently use the cached shard ranges when handling GET requests for the same container. - the proxy now caches more container metadata so that it can synthesize a complete set of container GET response headers from cache. - the proxy server now enforces more container GET request validity checks that were previously only enforced by the backend server, e.g. checks for valid request parameter values With this change, when the proxy learns from container metadata that the container is sharded then it will cache shard ranges fetched from the backend during a container GET in memcache. On subsequent container GETs the proxy will use the cached shard ranges to gather object listings from shard containers, avoiding further GET requests to the root container until the cached shard ranges expire from cache. Cached shard ranges are most useful if they cover the entire object name space in the container. The proxy therefore uses a new X-Backend-Override-Shard-Name-Filter header to instruct the container server to ignore any request parameters that would constrain the returned shard range listing i.e. 'marker', 'end_marker', 'includes' and 'reverse' parameters. Having obtained the entire shard range listing (either from the server or from cache) the proxy now applies those request parameter constraints itself when constructing the client response. When using cached shard ranges the proxy will synthesize response headers from the container metadata that is also in cache. To enable the full set of container GET response headers to be synthezised in this way, the set of metadata that the proxy caches when handling a backend container GET response is expanded to include various timestamps. The X-Newest header may be used to disable looking up shard ranges in cache. Change-Id: I5fc696625d69d1ee9218ee2a508a1b9be6cf9685
author: Alistair Coles <alistairncoles@gmail.com> 2020-11-03 11:08:56 +0000
committer: Alistair Coles <alistairncoles@gmail.com> 2021-01-06 16:28:49 +0000
commit: 077ba77ea6f735c98709b60e1fc9aa9f0f90676a (patch)
tree: a09bf1aba38d6dda859aadf60e56fdd1f87c99d0 /swift/proxy/controllers/container.py
parent: 453983764727ba528222a413691a0f0fc5a8f8cf (diff)
download: swift-077ba77ea6f735c98709b60e1fc9aa9f0f90676a.tar.gz
1 files changed, 211 insertions, 34 deletions
diff --git a/swift/proxy/controllers/container.py b/swift/proxy/controllers/container.py
index 00e460171..19f631c45 100644
--- a/swift/proxy/controllers/container.py
+++ b/swift/proxy/controllers/container.py
@@ -15,21 +15,23 @@
 
 from swift import gettext_ as _
 import json
+import math
 
 import six
 from six.moves.urllib.parse import unquote
 
 from swift.common.utils import public, private, csv_append, Timestamp, \
-    config_true_value, ShardRange
+    config_true_value, ShardRange, cache_from_env, filter_shard_ranges
 from swift.common.constraints import check_metadata, CONTAINER_LISTING_LIMIT
 from swift.common.http import HTTP_ACCEPTED, is_success
-from swift.common.request_helpers import get_sys_meta_prefix
+from swift.common.request_helpers import get_sys_meta_prefix, get_param, \
+    constrain_req_limit, validate_container_params
 from swift.proxy.controllers.base import Controller, delay_denial, \
-    cors_validation, set_info_cache, clear_info_cache
+    cors_validation, set_info_cache, clear_info_cache, _get_info_from_caches, \
+    get_cache_key, headers_from_container_info, update_headers
 from swift.common.storage_policy import POLICIES
-from swift.common.swob import HTTPBadRequest, HTTPForbidden, \
-    HTTPNotFound, HTTPServiceUnavailable, str_to_wsgi, wsgi_to_str, \
-    bytes_to_wsgi
+from swift.common.swob import HTTPBadRequest, HTTPForbidden, HTTPNotFound, \
+    HTTPServiceUnavailable, str_to_wsgi, wsgi_to_str, bytes_to_wsgi, Response
 
 
 class ContainerController(Controller):
@@ -87,6 +89,144 @@ class ContainerController(Controller):
                         return HTTPBadRequest(request=req, body=str(err))
         return None
 
+    def _clear_container_info_cache(self, req):
+        clear_info_cache(self.app, req.environ,
+                         self.account_name, self.container_name)
+        clear_info_cache(self.app, req.environ,
+                         self.account_name, self.container_name, 'listing')
+        # TODO: should we also purge updating shards from cache?
+
+    def _GETorHEAD_from_backend(self, req):
+        part = self.app.container_ring.get_part(
+            self.account_name, self.container_name)
+        concurrency = self.app.container_ring.replica_count \
+            if self.app.get_policy_options(None).concurrent_gets else 1
+        node_iter = self.app.iter_nodes(self.app.container_ring, part)
+        resp = self.GETorHEAD_base(
+            req, _('Container'), node_iter, part,
+            req.swift_entity_path, concurrency)
+        return resp
+
+    def _filter_resp_shard_ranges(self, req, cached_ranges):
+        # filter returned shard ranges according to request constraints
+        marker = get_param(req, 'marker', '')
+        end_marker = get_param(req, 'end_marker')
+        includes = get_param(req, 'includes')
+        reverse = config_true_value(get_param(req, 'reverse'))
+        if reverse:
+            marker, end_marker = end_marker, marker
+        shard_ranges = [
+            ShardRange.from_dict(shard_range)
+            for shard_range in cached_ranges]
+        shard_ranges = filter_shard_ranges(shard_ranges, includes, marker,
+                                           end_marker)
+        if reverse:
+            shard_ranges.reverse()
+        return json.dumps([dict(sr) for sr in shard_ranges]).encode('ascii')
+
+    def _GET_using_cache(self, req):
+        # It may be possible to fulfil the request from cache: we only reach
+        # here if request record_type is 'shard' or 'auto', so if the container
+        # state is 'sharded' then look for cached shard ranges. However, if
+        # X-Newest is true then we always fetch from the backend servers.
+        get_newest = config_true_value(req.headers.get('x-newest', False))
+        if get_newest:
+            self.app.logger.debug(
+                'Skipping shard cache lookup (x-newest) for %s', req.path_qs)
+            info = None
+        else:
+            info = _get_info_from_caches(self.app, req.environ,
+                                         self.account_name,
+                                         self.container_name)
+        if (info and is_success(info['status']) and
+                info.get('sharding_state') == 'sharded'):
+            # container is sharded so we may have the shard ranges cached
+            headers = headers_from_container_info(info)
+            if headers:
+                # only use cached values if all required headers available
+                infocache = req.environ.setdefault('swift.infocache', {})
+                memcache = cache_from_env(req.environ, True)
+                cache_key = get_cache_key(self.account_name,
+                                          self.container_name,
+                                          shard='listing')
+                cached_ranges = infocache.get(cache_key)
+                if cached_ranges is None and memcache:
+                    cached_ranges = memcache.get(cache_key)
+                if cached_ranges is not None:
+                    infocache[cache_key] = tuple(cached_ranges)
+                    # shard ranges can be returned from cache
+                    self.app.logger.debug('Found %d shards in cache for %s',
+                                          len(cached_ranges), req.path_qs)
+                    headers.update({'x-backend-record-type': 'shard',
+                                    'x-backend-cached-results': 'true'})
+                    shard_range_body = self._filter_resp_shard_ranges(
+                        req, cached_ranges)
+                    # mimic GetOrHeadHandler.get_working_response...
+                    # note: server sets charset with content_type but proxy
+                    # GETorHEAD_base does not, so don't set it here either
+                    resp = Response(request=req, body=shard_range_body)
+                    update_headers(resp, headers)
+                    resp.last_modified = math.ceil(
+                        float(headers['x-put-timestamp']))
+                    resp.environ['swift_x_timestamp'] = headers.get(
+                        'x-timestamp')
+                    resp.accept_ranges = 'bytes'
+                    resp.content_type = 'application/json'
+                    return resp
+
+        # The request was not fulfilled from cache so send to the backend
+        # server, but instruct the backend server to ignore name constraints in
+        # request params if returning shard ranges so that the response can
+        # potentially be cached. Only do this if the container state is
+        # 'sharded'. We don't attempt to cache shard ranges for a 'sharding'
+        # container as they may include the container itself as a 'gap filler'
+        # for shard ranges that have not yet cleaved; listings from 'gap
+        # filler' shard ranges are likely to become stale as the container
+        # continues to cleave objects to its shards and caching them is
+        # therefore more likely to result in stale or incomplete listings on
+        # subsequent container GETs.
+        req.headers['x-backend-override-shard-name-filter'] = 'sharded'
+        resp = self._GETorHEAD_from_backend(req)
+
+        sharding_state = resp.headers.get(
+            'x-backend-sharding-state', '').lower()
+        resp_record_type = resp.headers.get(
+            'x-backend-record-type', '').lower()
+        complete_listing = config_true_value(resp.headers.pop(
+            'x-backend-override-shard-name-filter', False))
+        # given that we sent 'x-backend-override-shard-name-filter=sharded' we
+        # should only receive back 'x-backend-override-shard-name-filter=true'
+        # if the sharding state is 'sharded', but check them both anyway...
+        if (resp_record_type == 'shard' and
+                sharding_state == 'sharded' and
+                complete_listing):
+            # backend returned unfiltered listing state shard ranges so parse
+            # them and replace response body with filtered listing
+            cache_key = get_cache_key(self.account_name, self.container_name,
+                                      shard='listing')
+            data = self._parse_listing_response(req, resp)
+            backend_shard_ranges = self._parse_shard_ranges(req, data, resp)
+            if backend_shard_ranges is not None:
+                cached_ranges = [dict(sr) for sr in backend_shard_ranges]
+                if resp.headers.get('x-backend-sharding-state') == 'sharded':
+                    # cache in infocache even if no shard ranges returned; this
+                    # is unexpected but use that result for this request
+                    infocache = req.environ.setdefault('swift.infocache', {})
+                    infocache[cache_key] = tuple(cached_ranges)
+                    memcache = cache_from_env(req.environ, True)
+                    if memcache and cached_ranges:
+                        # cache in memcache only if shard ranges as expected
+                        self.app.logger.debug('Caching %d shards for %s',
+                                              len(cached_ranges), req.path_qs)
+                        memcache.set(
+                            cache_key, cached_ranges,
+                            time=self.app.recheck_listing_shard_ranges)
+
+                # filter returned shard ranges according to request constraints
+                resp.body = self._filter_resp_shard_ranges(req, cached_ranges)
+
+        return resp
+
     def GETorHEAD(self, req):
         """Handler for HTTP GET/HEAD requests."""
         ai = self.account_info(self.account_name, req)
@@ -102,33 +242,51 @@ class ContainerController(Controller):
             # Don't cache this. The lack of account will be cached, and that
             # is sufficient.
             return HTTPNotFound(request=req)
-        part = self.app.container_ring.get_part(
-            self.account_name, self.container_name)
-        concurrency = self.app.container_ring.replica_count \
-            if self.app.get_policy_options(None).concurrent_gets else 1
-        node_iter = self.app.iter_nodes(self.app.container_ring, part)
+
+        # The read-modify-write of params here is because the Request.params
+        # getter dynamically generates a dict of params from the query string;
+        # the setter must be called for new params to update the query string.
         params = req.params
         params['format'] = 'json'
+        # x-backend-record-type may be sent via internal client e.g. from
+        # the sharder or in probe tests
         record_type = req.headers.get('X-Backend-Record-Type', '').lower()
         if not record_type:
             record_type = 'auto'
             req.headers['X-Backend-Record-Type'] = 'auto'
             params['states'] = 'listing'
         req.params = params
-        resp = self.GETorHEAD_base(
-            req, _('Container'), node_iter, part,
-            req.swift_entity_path, concurrency)
+
+        memcache = cache_from_env(req.environ, True)
+        if (req.method == 'GET' and
+                record_type != 'object' and
+                self.app.recheck_listing_shard_ranges > 0 and
+                memcache and
+                get_param(req, 'states') == 'listing' and
+                not config_true_value(
+                    req.headers.get('x-backend-include-deleted', False))):
+            # This GET might be served from cache or might populate cache.
+            # 'x-backend-include-deleted' is not usually expected in requests
+            # to the proxy (it is used from sharder to container servers) but
+            # it is included in the conditions just in case because we don't
+            # cache deleted shard ranges.
+            resp = self._GET_using_cache(req)
+        else:
+            resp = self._GETorHEAD_from_backend(req)
+
         resp_record_type = resp.headers.get('X-Backend-Record-Type', '')
         if all((req.method == "GET", record_type == 'auto',
                resp_record_type.lower() == 'shard')):
             resp = self._get_from_shards(req, resp)
 
-        # Cache this. We just made a request to a storage node and got
-        # up-to-date information for the container.
-        resp.headers['X-Backend-Recheck-Container-Existence'] = str(
-            self.app.recheck_container_existence)
-        set_info_cache(self.app, req.environ, self.account_name,
-                       self.container_name, resp)
+        if not config_true_value(
+                resp.headers.get('X-Backend-Cached-Results')):
+            # Cache container metadata. We just made a request to a storage
+            # node and got up-to-date information for the container.
+            resp.headers['X-Backend-Recheck-Container-Existence'] = str(
+                self.app.recheck_container_existence)
+            set_info_cache(self.app, req.environ, self.account_name,
+                           self.container_name, resp)
         if 'swift.authorize' in req.environ:
             req.acl = resp.headers.get('x-container-read')
             aresp = req.environ['swift.authorize'](req)
@@ -171,7 +329,7 @@ class ContainerController(Controller):
             return resp
 
         objects = []
-        req_limit = int(req.params.get('limit') or CONTAINER_LISTING_LIMIT)
+        req_limit = constrain_req_limit(req, CONTAINER_LISTING_LIMIT)
         params = req.params.copy()
         params.pop('states', None)
         req.headers.pop('X-Backend-Record-Type', None)
@@ -181,7 +339,7 @@ class ContainerController(Controller):
         prefix = wsgi_to_str(params.get('prefix'))
 
         limit = req_limit
-        for shard_range in shard_ranges:
+        for i, shard_range in enumerate(shard_ranges):
             params['limit'] = limit
             # Always set marker to ensure that object names less than or equal
             # to those already in the listing are not fetched; if the listing
@@ -207,12 +365,13 @@ class ContainerController(Controller):
             else:
                 params['end_marker'] = str_to_wsgi(shard_range.end_marker)
 
+            headers = {}
             if ((shard_range.account, shard_range.container) in
                     shard_listing_history):
                 # directed back to same container - force GET of objects
-                headers = {'X-Backend-Record-Type': 'object'}
-            else:
-                headers = None
+                headers['X-Backend-Record-Type'] = 'object'
+            if config_true_value(req.headers.get('x-newest', False)):
+                headers['X-Newest'] = 'true'
 
             if prefix:
                 if prefix > shard_range:
@@ -225,14 +384,33 @@ class ContainerController(Controller):
                     if just_past < shard_range:
                         continue
 
-            self.app.logger.debug('Getting from %s %s with %s',
-                                  shard_range, shard_range.name, headers)
+            self.app.logger.debug(
+                'Getting listing part %d from shard %s %s with %s',
+                i, shard_range, shard_range.name, headers)
             objs, shard_resp = self._get_container_listing(
                 req, shard_range.account, shard_range.container,
                 headers=headers, params=params)
 
+            shard_state = 'unknown'
+            try:
+                shard_state = shard_resp.headers['x-backend-sharding-state']
+                shard_state = ShardRange.resolve_state(shard_state)
+            except (AttributeError, ValueError, KeyError):
+                pass
+
+            if objs is None:
+                # tolerate errors
+                self.app.logger.debug(
+                    'Failed to get objects from shard (state=%s), total = %d',
+                    shard_state, len(objects))
+                continue
+
+            self.app.logger.debug(
+                'Found %d objects in shard (state=%s), total = %d',
+                len(objs), shard_state, len(objs) + len(objects))
+
             if not objs:
-                # tolerate errors or empty shard containers
+                # tolerate empty shard containers
                 continue
 
             objects.extend(objs)
@@ -270,6 +448,8 @@ class ContainerController(Controller):
     @cors_validation
     def GET(self, req):
         """Handler for HTTP GET requests."""
+        # early checks for request validity
+        validate_container_params(req)
         return self.GETorHEAD(req)
 
     @public
@@ -328,8 +508,7 @@ class ContainerController(Controller):
         resp = self.make_requests(
             req, self.app.container_ring,
             container_partition, 'PUT', req.swift_entity_path, headers)
-        clear_info_cache(self.app, req.environ,
-                         self.account_name, self.container_name)
+        self._clear_container_info_cache(req)
         return resp
 
     @public
@@ -354,8 +533,7 @@ class ContainerController(Controller):
         container_partition, containers = self.app.container_ring.get_nodes(
             self.account_name, self.container_name)
         headers = self.generate_request_headers(req, transfer=True)
-        clear_info_cache(self.app, req.environ,
-                         self.account_name, self.container_name)
+        self._clear_container_info_cache(req)
         resp = self.make_requests(
             req, self.app.container_ring, container_partition, 'POST',
             req.swift_entity_path, [headers] * len(containers))
@@ -373,8 +551,7 @@ class ContainerController(Controller):
             self.account_name, self.container_name)
         headers = self._backend_requests(req, len(containers),
                                          account_partition, accounts)
-        clear_info_cache(self.app, req.environ,
-                         self.account_name, self.container_name)
+        self._clear_container_info_cache(req)
         resp = self.make_requests(
             req, self.app.container_ring, container_partition, 'DELETE',
             req.swift_entity_path, headers)
author	Alistair Coles <alistairncoles@gmail.com>	2020-11-03 11:08:56 +0000
committer	Alistair Coles <alistairncoles@gmail.com>	2021-01-06 16:28:49 +0000
commit	077ba77ea6f735c98709b60e1fc9aa9f0f90676a (patch)
tree	a09bf1aba38d6dda859aadf60e56fdd1f87c99d0 /swift/proxy/controllers/container.py
parent	453983764727ba528222a413691a0f0fc5a8f8cf (diff)
download	swift-077ba77ea6f735c98709b60e1fc9aa9f0f90676a.tar.gz