swift/container/replicator.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389

# Copyright (c) 2010-2012 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import itertools
import json
from collections import defaultdict
from eventlet import Timeout

from swift.container.sync_store import ContainerSyncStore
from swift.container.backend import ContainerBroker, DATADIR, SHARDED
from swift.container.reconciler import (
    MISPLACED_OBJECTS_ACCOUNT, incorrect_policy_index,
    get_reconciler_container_name, get_row_to_q_entry_translator)
from swift.common import db_replicator
from swift.common.storage_policy import POLICIES
from swift.common.swob import HTTPOk, HTTPAccepted
from swift.common.exceptions import DeviceUnavailable
from swift.common.http import is_success
from swift.common.utils import Timestamp, majority_size, get_db_files


class ContainerReplicator(db_replicator.Replicator):
    """
    Replicator for container databases.

    Builds on the generic database replicator with the container-specific
    behaviour implemented by this class's methods: syncing the storage
    policy index and timestamps with peers, replicating shard ranges,
    enqueuing misplaced object rows into reconciler containers and keeping
    the container sync store up to date.
    """
    # Class-level settings; presumably consumed by the parent
    # db_replicator.Replicator -- confirm against the base class.
    server_type = 'container'
    brokerclass = ContainerBroker
    datadir = DATADIR
    default_port = 6201

    def __init__(self, conf, logger=None):
        """
        Initialise the replicator and its reconciler/sync-store state.

        :param conf: configuration passed through to the parent replicator
        :param logger: optional logger passed through to the parent
                       replicator
        """
        super(ContainerReplicator, self).__init__(conf, logger=logger)
        # run_once() replaces these with real containers at the start of
        # each pass. reconciler_containers has to exist from construction
        # too: get_reconciler_broker() reads it and already treats None as
        # "no cache", so without this a call made before run_once() would
        # raise AttributeError.
        self.reconciler_containers = None
        self.reconciler_cleanups = self.sync_store = None

    def report_up_to_date(self, full_info):
        reported_key_map = {
            'reported_put_timestamp': 'put_timestamp',
            'reported_delete_timestamp': 'delete_timestamp',
            'reported_bytes_used': 'bytes_used',
            'reported_object_count': 'count',
        }
        for reported, value_key in reported_key_map.items():
            if full_info[reported] != full_info[value_key]:
                return False
        return True

    def _gather_sync_args(self, replication_info):
        """
        Build the sync arguments, adding policy details when more than one
        storage policy is configured.

        :param replication_info: replication info dict for the local DB

        :returns: the parent's sync args, extended with
                  ``status_changed_at``, ``count`` and
                  ``storage_policy_index`` when ``len(POLICIES) > 1``
        """
        sync_args = super(ContainerReplicator, self)._gather_sync_args(
            replication_info)
        if len(POLICIES) > 1:
            policy_keys = ('status_changed_at', 'count',
                           'storage_policy_index')
            policy_values = [replication_info[key] for key in policy_keys]
            sync_args += tuple(policy_values)
        return sync_args

    def _handle_sync_response(self, node, response, info, broker, http,
                              different_region=False):
        """
        Apply container-specific updates from a successful sync response,
        then hand over to the parent implementation.

        On success the remote's replication info is decoded from the
        response body and used to (in this order) correct the local storage
        policy index, merge timestamps, and pull the remote's shard ranges.

        :param node: the peer node, passed through to the parent
        :param response: the peer's response to the sync request
        :param info: replication info for the local DB
        :param broker: the local container broker
        :param http: connection to the peer
        :param different_region: passed through to the parent

        :returns: whatever the parent implementation returns
        """
        if is_success(response.status):
            remote_info = json.loads(response.data.decode('ascii'))

            if incorrect_policy_index(info, remote_info):
                now = Timestamp.now()
                broker.set_storage_policy_index(
                    remote_info['storage_policy_index'],
                    timestamp=now.internal)

            timestamp_keys = ('created_at', 'put_timestamp',
                              'delete_timestamp')
            for key in timestamp_keys:
                if info[key] != remote_info[key]:
                    # one mismatch is enough to merge all three
                    broker.merge_timestamps(
                        *[remote_info[k] for k in timestamp_keys])
                    break

            if remote_info.get('shard_max_row', -1) >= 0:
                # Grab remote's shard ranges, too
                self._fetch_and_merge_shard_ranges(http, broker)

        parent = super(ContainerReplicator, self)
        return parent._handle_sync_response(
            node, response, info, broker, http, different_region)

    def _sync_shard_ranges(self, broker, http, local_id):
        """
        Send all of the local DB's shard range data to a peer.

        :param broker: the local container broker
        :param http: connection to the peer; ``http.node`` is used for
                     logging
        :param local_id: id of the local DB, sent with the request

        :returns: False if the replicate request failed, True otherwise
                  (including when there is nothing to send)
        """
        # TODO: currently the number of shard ranges is expected to be _much_
        # less than normal objects so all are sync'd on each cycle. However, in
        # future there should be sync points maintained much like for object
        # syncing so that only new shard range rows are sync'd.
        shard_range_data = broker.get_all_shard_range_data()
        if not shard_range_data:
            return True
        sent = self._send_replicate_request(
            http, 'merge_shard_ranges', shard_range_data, local_id)
        if not sent:
            return False
        self.logger.debug('%s synced %s shard ranges to %s',
                          broker.db_file, len(shard_range_data),
                          '%(ip)s:%(port)s/%(device)s' % http.node)
        return True

    def _choose_replication_mode(self, node, rinfo, info, local_sync, broker,
                                 http, different_region):
        """
        Replicate shard ranges to the peer, then replicate objects unless
        the local DB has initiated sharding.

        :param node: node dict for the peer; its ``ip``, ``port`` and
                     ``device`` are used in log messages
        :param rinfo: replication info returned by the peer
        :param info: replication info for the local DB
        :param local_sync: passed through to the parent implementation
        :param broker: the local container broker
        :param http: connection to the peer
        :param different_region: passed through to the parent implementation

        :returns: the shard range sync result if that failed or if object
                  replication was deferred because of sharding; otherwise
                  the parent implementation's result
        """
        if 'shard_max_row' not in rinfo:
            # the peer's replication info carries no 'shard_max_row'
            shard_range_success = False
            self.logger.warning(
                '%s is unable to replicate shard ranges to peer %s; '
                'peer may need upgrading', broker.db_file,
                '%(ip)s:%(port)s/%(device)s' % node)
        else:
            # Always replicate shard ranges to new-enough swift
            shard_range_success = self._sync_shard_ranges(
                broker, http, info['id'])

        if not broker.sharding_initiated():
            parent = super(ContainerReplicator, self)
            success = parent._choose_replication_mode(
                node, rinfo, info, local_sync, broker, http,
                different_region)
            return shard_range_success and success

        # Sharding has started: objects are not replicated, only counted as
        # deferred. How loudly that is logged depends on whether anything
        # was actually left behind.
        nothing_left = (info['db_state'] == SHARDED and
                        len(broker.get_objects(limit=1)) == 0)
        peer = '%(ip)s:%(port)s/%(device)s' % node
        if nothing_left:
            self.logger.debug('%s is sharded and has nothing more to '
                              'replicate to peer %s',
                              broker.db_file, peer)
        else:
            # Only print the scary warning if there was something that
            # didn't get replicated
            self.logger.warning(
                '%s is able to shard -- refusing to replicate objects to '
                'peer %s; have shard ranges and will wait for cleaving',
                broker.db_file, peer)
        self.stats['deferred'] += 1
        return shard_range_success

    def _fetch_and_merge_shard_ranges(self, http, broker):
        """
        Request the peer's shard ranges and merge them into the local DB.

        The request is bounded by ``self.node_timeout``; an unsuccessful
        response is ignored.

        :param http: connection to the peer
        :param broker: the local container broker
        """
        with Timeout(self.node_timeout):
            response = http.replicate('get_shard_ranges')
        if not is_success(response.status):
            return
        remote_shard_ranges = json.loads(response.data.decode('ascii'))
        broker.merge_shard_ranges(remote_shard_ranges)

    def find_local_handoff_for_part(self, part):
        """
        Look through devices in the ring for the first handoff device that was
        identified during job creation as available on this node.

        :returns: a node entry from the ring
        """
        nodes = self.ring.get_part_nodes(part)
        more_nodes = self.ring.get_more_nodes(part)

        for node in itertools.chain(nodes, more_nodes):
            if node['id'] in self._local_device_ids:
                return node
        return None

    def get_reconciler_broker(self, timestamp):
        """
        Return a local broker for the reconciler container that the given
        timestamp maps to, creating it if it is not already cached.

        :param timestamp: the timestamp of the row to be enqueued

        :returns: a local reconciler broker
        :raises DeviceUnavailable: if no local device is found for the
                                   reconciler container's partition
        """
        container = get_reconciler_container_name(timestamp)
        if self.reconciler_containers:
            if container in self.reconciler_containers:
                # cache entries are (part, broker, node_id)
                return self.reconciler_containers[container][1]

        part = self.ring.get_part(MISPLACED_OBJECTS_ACCOUNT, container)
        node = self.find_local_handoff_for_part(part)
        if not node:
            raise DeviceUnavailable(
                'No mounted devices found suitable to Handoff reconciler '
                'container %s in partition %s' % (container, part))

        device_path = os.path.join(self.root, node['device'])
        broker = ContainerBroker.create_broker(
            device_path, part, MISPLACED_OBJECTS_ACCOUNT, container,
            logger=self.logger, put_timestamp=timestamp,
            storage_policy_index=0)
        if self.reconciler_containers is not None:
            # remember it so it is replicated at the end of the run
            self.reconciler_containers[container] = part, broker, node['id']
        return broker

    def feed_reconciler(self, container, item_list):
        """
        Merge queue entries into the local reconciler container database.

        Failure to find a local device, or any error while merging, is
        logged and reported through the return value rather than raised.

        :param container: the name of the reconciler container
        :param item_list: the list of rows to enqueue

        :returns: True if successfully enqueued, False otherwise
        """
        try:
            reconciler = self.get_reconciler_broker(container)
        except DeviceUnavailable as err:
            self.logger.warning('DeviceUnavailable: %s', err)
            return False

        self.logger.debug('Adding %d objects to the reconciler at %s',
                          len(item_list), reconciler.db_file)
        try:
            reconciler.merge_items(item_list)
        except (Exception, Timeout):
            # Timeout is listed explicitly alongside Exception
            self.logger.exception('UNHANDLED EXCEPTION: trying to merge '
                                  '%d items to reconciler container %s',
                                  len(item_list), reconciler.db_file)
            return False
        else:
            return True

    def dump_to_reconciler(self, broker, point):
        """
        Enqueue misplaced object rows with a ``ROWID`` greater than point
        into the reconciler containers, ``self.per_diff`` rows at a time.

        Rows are grouped by the reconciler container their ``created_at``
        maps to. Once any group fails to enqueue, the returned sync point
        stops advancing for the remainder of the call, although later rows
        are still fed.

        :param broker: the container broker with misplaced objects
        :param point: the last verified ``reconciler_sync_point``

        :returns: the broker's max row if there was nothing misplaced,
                  otherwise the last successfully enqueued rowid
        """
        max_sync = broker.get_max_row()
        rows = broker.get_misplaced_since(point, self.per_diff)
        if not rows:
            return max_sync

        to_queue_entry = get_row_to_q_entry_translator(broker)
        had_errors = False
        last_good = point
        while rows:
            by_container = defaultdict(list)
            for row in rows:
                name = get_reconciler_container_name(row['created_at'])
                by_container[name].append(to_queue_entry(row))
            for name, entries in by_container.items():
                if not self.feed_reconciler(name, entries):
                    had_errors = True
            point = rows[-1]['ROWID']
            if not had_errors:
                last_good = point
            rows = broker.get_misplaced_since(point, self.per_diff)
        return last_good

    def _post_replicate_hook(self, broker, info, responses):
        """
        Update the sync store and the reconciler sync point after a DB has
        been replicated.

        Nothing is done for DBs in the misplaced objects account. Errors
        from the sync store update are logged and do not stop the
        reconciler handling.

        :param broker: the container broker that was replicated
        :param info: replication info for the DB
        :param responses: list of per-peer replication results
        """
        if info['account'] == MISPLACED_OBJECTS_ACCOUNT:
            return

        try:
            self.sync_store.update_sync_store(broker)
        except Exception:
            self.logger.exception('Failed to update sync_store %s' %
                                  broker.db_file)

        point = broker.get_reconciler_sync()
        single_policy = not broker.has_multiple_policies()
        if single_policy and info['max_row'] != point:
            # nothing misplaced to enqueue; just move the sync point up
            broker.update_reconciler_sync(info['max_row'])
            return

        max_sync = self.dump_to_reconciler(broker, point)
        have_majority = (
            responses.count(True) >= majority_size(len(responses)))
        if max_sync > point and have_majority:
            # to be safe, only slide up the sync point with a majority on
            # replication
            broker.update_reconciler_sync(max_sync)

    def cleanup_post_replicate(self, broker, orig_info, responses):
        """
        Skip post-replication cleanup for DBs that require sharding;
        otherwise defer to the parent implementation.

        :param broker: the container broker that was replicated
        :param orig_info: passed through to the parent implementation
        :param responses: passed through to the parent implementation

        :returns: True when cleanup is skipped because of sharding, else
                  the parent implementation's result
        """
        if not broker.sharding_required():
            parent = super(ContainerReplicator, self)
            return parent.cleanup_post_replicate(
                broker, orig_info, responses)

        # despite being a handoff, since we're sharding we're not going to
        # do any cleanup so we can continue cleaving - this is still
        # considered "success"
        self.logger.debug(
            'Not deleting db %s (requires sharding, state %s)',
            broker.db_file, broker.get_db_state())
        return True

    def delete_db(self, broker):
        """
        Delete a container DB, except that reconciler DBs are only recorded
        for cleanup while ``self.reconciler_cleanups`` is active, so that
        they are removed at the end of the replication run.

        For any other DB the sync store entry is removed first (failures
        are logged and ignored) and the parent implementation then does the
        delete.

        :param broker: the broker for the DB to delete

        :returns: None when the delete is postponed, else the parent
                  implementation's result
        """
        postpone = (self.reconciler_cleanups is not None and
                    broker.account == MISPLACED_OBJECTS_ACCOUNT)
        if postpone:
            # this container shouldn't be here, make sure it's cleaned up
            self.reconciler_cleanups[broker.container] = broker
            return

        if self.sync_store:
            try:
                # DB is going to get deleted. Be preemptive about it
                self.sync_store.remove_synced_container(broker)
            except Exception:
                self.logger.exception('Failed to remove sync_store entry %s' %
                                      broker.db_file)

        parent = super(ContainerReplicator, self)
        return parent.delete_db(broker)

    def replicate_reconcilers(self):
        """
        Replicate the reconciler containers touched during this run, then
        delete the reconciler containers that were queued for cleanup.

        Both phases are spawned on ``self.cpool`` and waited for. The
        reconciler caches are reset to None between the two phases.
        """
        log = self.logger
        pool = self.cpool

        log.info('Replicating %d reconciler containers',
                 len(self.reconciler_containers))
        for entry in self.reconciler_containers.values():
            part, reconciler, node_id = entry
            pool.spawn_n(
                self._replicate_object, part, reconciler.db_file, node_id)
        pool.waitall()

        # wipe out the cache to disable bypass in delete_db
        pending_cleanups = self.reconciler_cleanups
        self.reconciler_cleanups = self.reconciler_containers = None
        log.info('Cleaning up %d reconciler containers',
                 len(pending_cleanups))
        for stale_broker in pending_cleanups.values():
            pool.spawn_n(self.delete_db, stale_broker)
        pool.waitall()
        log.info('Finished reconciler replication')

    def run_once(self, *args, **kwargs):
        """
        Run one replication pass, followed by reconciler replication if the
        pass touched or queued any reconciler containers.

        The reconciler caches and the sync store are re-created at the
        start of every pass.

        :returns: the parent implementation's result
        """
        self.reconciler_containers = {}
        self.reconciler_cleanups = {}
        self.sync_store = ContainerSyncStore(
            self.root, self.logger, self.mount_check)

        parent = super(ContainerReplicator, self)
        rv = parent.run_once(*args, **kwargs)
        if self.reconciler_containers or self.reconciler_cleanups:
            self.replicate_reconcilers()
        return rv


class ContainerReplicatorRpc(db_replicator.ReplicatorRpc):

    def _db_file_exists(self, db_path):
        """
        Report whether ``get_db_files`` finds anything for the given path.

        :param db_path: path of the DB to look for

        :returns: True if ``get_db_files(db_path)`` is non-empty, else False
        """
        found_files = get_db_files(db_path)
        return bool(found_files)

    def _parse_sync_args(self, args):
        """
        Convert sync args into remote info, picking up the policy-related
        values when the sender included them.

        :param args: the sync request arguments; when more than nine are
                     given, items 7, 8 and 9 are ``status_changed_at``,
                     ``count`` and ``storage_policy_index``

        :returns: the remote info built by the parent implementation,
                  extended with those three keys when present
        """
        remote_info = super(
            ContainerReplicatorRpc, self)._parse_sync_args(args)
        if len(args) <= 9:
            return remote_info
        remote_info['status_changed_at'] = args[7]
        remote_info['count'] = args[8]
        remote_info['storage_policy_index'] = args[9]
        return remote_info

    def _get_synced_replication_info(self, broker, remote_info):
        """
        Return the local replication info, first adopting the remote's
        storage_policy_index if the local one is judged incorrect.

        :param broker: the database broker
        :param remote_info: the remote replication info

        :returns: local broker replication info (re-read after any policy
                  index change)
        """
        local_info = broker.get_replication_info()
        if not incorrect_policy_index(local_info, remote_info):
            return local_info

        changed_at = Timestamp.now().internal
        broker.set_storage_policy_index(
            remote_info['storage_policy_index'], timestamp=changed_at)
        return broker.get_replication_info()

    def _abort_rsync_then_merge(self, db_file, old_filename):
        """
        Decide whether an rsync_then_merge should be aborted.

        :param db_file: path of the local DB
        :param old_filename: passed through to the parent implementation

        :returns: True if the parent implementation says to abort,
                  otherwise whether the local DB has initiated sharding
        """
        parent = super(ContainerReplicatorRpc, self)
        if parent._abort_rsync_then_merge(db_file, old_filename):
            return True
        # if the local db has started sharding since the original 'sync'
        # request then abort object replication now; instantiate a fresh
        # broker each time this check is performed so as to get the latest
        # state
        fresh_broker = ContainerBroker(db_file)
        return fresh_broker.sharding_initiated()

    def _post_rsync_then_merge_hook(self, existing_broker, new_broker):
        """
        Copy all shard range data from the existing DB into the new one.

        :param existing_broker: broker whose shard ranges are read
        :param new_broker: broker the shard ranges are merged into
        """
        # Note the following hook will need to change to using a pointer and
        # limit in the future.
        shard_range_data = existing_broker.get_all_shard_range_data()
        new_broker.merge_shard_ranges(shard_range_data)

    def merge_shard_ranges(self, broker, args):
        """
        Merge the shard ranges sent by a peer into the local DB.

        :param broker: the local container broker
        :param args: request arguments; the first item holds the shard
                     range data to merge

        :returns: an HTTPAccepted response
        """
        incoming_shard_ranges = args[0]
        broker.merge_shard_ranges(incoming_shard_ranges)
        return HTTPAccepted()

    def get_shard_ranges(self, broker, args):
        """
        Return all of the local DB's shard range data as JSON.

        :param broker: the local container broker
        :param args: request arguments (unused)

        :returns: an HTTPOk response with a JSON body
        """
        payload = json.dumps(broker.get_all_shard_range_data())
        return HTTPOk(headers={'Content-Type': 'application/json'},
                      body=payload)