author    Christian Schwede <christian.schwede@enovance.com>    2014-09-14 20:48:32 +0000
committer Christian Schwede <christian.schwede@enovance.com>    2014-09-30 20:05:41 +0000
commit    20e9ad538b6ba90674a75310d9cc184162ba9398 (patch)
tree      fec7871a0dc8e9f7be88e984113109fdfe013a2c
parent    09bdc87cbc1e7bc1918f9b5094bec266b6761d75 (diff)
Limit partition movement when adding a new tier
When adding a new tier (region, zone, node, device) to an existing, already
balanced ring, all existing partitions in the existing tiers of the same level
are gathered for reassignment, even when there is not enough space in the new
tier. This creates a lot of unnecessary replication traffic on the backend
network. For example, when only one region exists in the ring and a new region
is added, all existing parts are selected for reassignment, even when the new
region has a total weight of 0. The same applies to zones, nodes and devices.

This patch limits the number of partitions that are chosen for reassignment by
checking for devices on other tiers that are asking for more partitions.
Failed devices are not considered when applying the limit.

Co-Authored-By: Florent Flament <florent.flament-ext@cloudwatt.com>
Change-Id: I6178452e47492da4677a8ffe4fb24917b5968cd9
Closes-Bug: 1367826
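For context, the behaviour can be reproduced directly with the ring builder. The snippet below is a minimal sketch, not part of the change; the device IDs, ports and weights are illustrative. It builds a balanced single-region ring, adds a zero-weight device in a new region and rebalances; with this patch the number of reassigned partitions stays proportional to the weight actually added instead of gathering a replica of every partition.

from swift.common import ring

# Balanced ring with five equal-weight devices in a single region
# (all values below are illustrative).
rb = ring.RingBuilder(8, 3, 1)   # part_power=8, replicas=3, min_part_hours=1
for i in range(5):
    rb.add_dev({'id': i, 'region': 0, 'zone': 0, 'weight': 100,
                'ip': '127.0.0.1', 'port': 6000 + i, 'device': 'sda1'})
rb.rebalance(seed=2)

# Add a device in a brand-new region, initially with zero weight.
rb.add_dev({'id': 5, 'region': 1, 'zone': 0, 'weight': 0,
            'ip': '127.0.0.5', 'port': 6000, 'device': 'sda1'})
rb.pretend_min_part_hours_passed()
changed_parts, _balance = rb.rebalance(seed=2)

# Without the limit, changed_parts could approach one replica of every
# partition; with it, movement tracks what region 1 can actually absorb.
print('reassigned %d partitions' % changed_parts)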
-rw-r--r--   swift/common/ring/builder.py            31
-rw-r--r--   test/unit/common/ring/test_builder.py   34
2 files changed, 61 insertions, 4 deletions
diff --git a/swift/common/ring/builder.py b/swift/common/ring/builder.py
index f3096f6e0..225916ce8 100644
--- a/swift/common/ring/builder.py
+++ b/swift/common/ring/builder.py
@@ -643,6 +643,26 @@ class RingBuilder(object):
                 self._last_part_moves[part] = 0xff
         self._last_part_moves_epoch = int(time())
 
+    def _get_available_parts(self):
+        """
+        Returns a tuple (wanted_parts_total, {tier: available parts in
+        other tiers}) for all tiers in the ring.
+
+        Devices that have too many partitions (negative parts_wanted) are
+        ignored; otherwise the sum of all parts_wanted is 0 +/- rounding
+        errors.
+
+        """
+        wanted_parts_total = 0
+        wanted_parts_for_tier = {}
+        for dev in self._iter_devs():
+            wanted_parts_total += max(0, dev['parts_wanted'])
+            for tier in tiers_for_dev(dev):
+                if tier not in wanted_parts_for_tier:
+                    wanted_parts_for_tier[tier] = 0
+                wanted_parts_for_tier[tier] += max(0, dev['parts_wanted'])
+        return (wanted_parts_total, wanted_parts_for_tier)
+
     def _gather_reassign_parts(self):
         """
         Returns a list of (partition, replicas) pairs to be reassigned by
@@ -671,6 +691,9 @@
         # currently sufficient spread out across the cluster.
         spread_out_parts = defaultdict(list)
         max_allowed_replicas = self._build_max_replicas_by_tier()
+        wanted_parts_total, wanted_parts_for_tier = \
+            self._get_available_parts()
+        moved_parts = 0
         for part in xrange(self.parts):
             # Only move one replica at a time if possible.
             if part in removed_dev_parts:
@@ -701,14 +724,20 @@
                     rep_at_tier = 0
                     if tier in replicas_at_tier:
                         rep_at_tier = replicas_at_tier[tier]
+                    # Only allow parts to be gathered if there are wanted
+                    # parts on other tiers.
+                    available_parts_for_tier = wanted_parts_total - \
+                        wanted_parts_for_tier[tier] - moved_parts
                     if (rep_at_tier > max_allowed_replicas[tier] and
                             self._last_part_moves[part] >=
-                            self.min_part_hours):
+                            self.min_part_hours and
+                            available_parts_for_tier > 0):
                         self._last_part_moves[part] = 0
                         spread_out_parts[part].append(replica)
                         dev['parts_wanted'] += 1
                         dev['parts'] -= 1
                         removed_replica = True
+                        moved_parts += 1
                         break
             if removed_replica:
                 if dev['id'] not in tfd:
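The accounting in _get_available_parts and the gate in _gather_reassign_parts can be checked by hand. The standalone sketch below is illustrative only (the device dicts and single-element tier tuples are made up, and it does not import Swift): per-device parts_wanted is clamped at zero, summed globally and per tier, and a replica may be gathered from a tier only while wanted_parts_total - wanted_parts_for_tier[tier] - moved_parts is positive, i.e. while devices outside that tier still want partitions.

from collections import defaultdict

# Hypothetical devices: the region-0 devices are full (parts_wanted <= 0),
# the freshly added region-1 device still wants 15 partitions.
devs = [
    {'id': 0, 'tiers': [(0,)], 'parts_wanted': -3},
    {'id': 1, 'tiers': [(0,)], 'parts_wanted': 0},
    {'id': 2, 'tiers': [(1,)], 'parts_wanted': 15},
]

wanted_parts_total = 0
wanted_parts_for_tier = defaultdict(int)
for dev in devs:
    wanted_parts_total += max(0, dev['parts_wanted'])
    for tier in dev['tiers']:
        wanted_parts_for_tier[tier] += max(0, dev['parts_wanted'])

# Gathering a replica from region 0 is allowed only while some other tier
# (here region 1) still wants partitions.
moved_parts = 0
available = wanted_parts_total - wanted_parts_for_tier[(0,)] - moved_parts
print('replicas that may be gathered from region 0: %d' % available)  # 15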
diff --git a/test/unit/common/ring/test_builder.py b/test/unit/common/ring/test_builder.py
index 6476b4cf8..3848725ba 100644
--- a/test/unit/common/ring/test_builder.py
+++ b/test/unit/common/ring/test_builder.py
@@ -19,6 +19,7 @@ import os
 import unittest
 import cPickle as pickle
 from collections import defaultdict
+from math import ceil
 from tempfile import mkdtemp
 from shutil import rmtree
@@ -718,9 +719,7 @@ class TestRingBuilder(unittest.TestCase):
         population_by_region = self._get_population_by_region(rb)
         self.assertEquals(population_by_region, {0: 682, 1: 86})
 
-        # Rebalancing will reassign 143 of the partitions, which is ~1/5
-        # of the total amount of partitions (3*256)
-        self.assertEqual(143, changed_parts)
+        self.assertEqual(87, changed_parts)
 
         # and since there's not enough room, subsequent rebalances will not
         # cause additional assignments to r1
@@ -744,6 +743,35 @@ class TestRingBuilder(unittest.TestCase):
         population_by_region = self._get_population_by_region(rb)
         self.assertEquals(population_by_region, {0: 512, 1: 256})
 
+    def test_avoid_tier_change_new_region(self):
+        rb = ring.RingBuilder(8, 3, 1)
+        for i in range(5):
+            rb.add_dev({'id': i, 'region': 0, 'zone': 0, 'weight': 100,
+                        'ip': '127.0.0.1', 'port': i, 'device': 'sda1'})
+        rb.rebalance(seed=2)
+
+        # Add a new device in a new region to a balanced ring
+        rb.add_dev({'id': 5, 'region': 1, 'zone': 0, 'weight': 0,
+                    'ip': '127.0.0.5', 'port': 10000, 'device': 'sda1'})
+
+        # Increase the weight of region 1 slowly
+        moved_partitions = []
+        for weight in range(0, 101, 10):
+            rb.set_dev_weight(5, weight)
+            rb.pretend_min_part_hours_passed()
+            changed_parts, _balance = rb.rebalance(seed=2)
+            moved_partitions.append(changed_parts)
+            # Ensure that the second region has enough partitions;
+            # otherwise there will be replicas at risk
+            min_parts_for_r1 = ceil(weight / (500.0 + weight) * 768)
+            parts_for_r1 = self._get_population_by_region(rb).get(1, 0)
+            self.assertEqual(min_parts_for_r1, parts_for_r1)
+
+        # Number of partitions moved on each rebalance:
+        # 10/510 * 768 ~ 15.06 -> move at least 15 partitions in the first step
+        ref = [0, 17, 16, 16, 14, 15, 13, 13, 12, 12, 14]
+        self.assertEqual(ref, moved_partitions)
+
     def test_set_replicas_increase(self):
         rb = ring.RingBuilder(8, 2, 0)
         rb.add_dev({'id': 0, 'region': 0, 'zone': 0, 'weight': 1,
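The reference values asserted in test_avoid_tier_change_new_region follow from proportional placement: with five region-0 devices at weight 100 and a single region-1 device at weight w, region 1 should hold at least ceil(w / (500 + w) * 3 * 2**8) of the 768 replica assignments. A standalone check of that arithmetic (plain Python, independent of the test above):

from math import ceil

total_replicas = 3 * 2 ** 8   # 3 replicas * 256 partitions = 768 assignments
for weight in range(0, 101, 10):
    expected = int(ceil(weight / (500.0 + weight) * total_replicas))
    print('weight=%3d -> at least %3d parts in region 1' % (weight, expected))
# e.g. weight=10 -> 16 (10/510 * 768 ~ 15.06), weight=100 -> 128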