summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulia Kreger <juliaashleykreger@gmail.com>2021-04-27 10:22:42 -0700
committerJulia Kreger <juliaashleykreger@gmail.com>2021-05-24 16:36:02 +0000
commitffff76a682b37d7485fe4d261520d102ab3e5fb8 (patch)
treed2e44d3236b2c198aa33fccf92ce22f097f761af
parent97ceb7bd157538bdaddc6fc6f564b8f0b980cf98 (diff)
downloadironic-ffff76a682b37d7485fe4d261520d102ab3e5fb8.tar.gz
Add basic tools for benchmarking
Adds a horribly written, just hacked together little tool to help provide sizing insight into an ironic deployment's state and underlying performance. Key data: * Queries the list of nodes from a pure python interface level with the database and reports timing for the list of nodes to be returned. This information helps convey how long a periodic hits the database just for the query. * Requests *all* nodes using the query pattern/structure of the nova resource tracker, and uses the marker to make any additional requests. The data is parsed, and collected, and counts identified vendors, if any. * Collects basic data on conductors in terms of running, conductor groups as well as currently loaded drivers in the deployment. All of this information provides operational insight into *what* conditions exist within the deployment allowing developers to try and identify solutions based on the unique circumstances of larger deployments. Also adds a utility to generate and semi-randomize data to allow us to create a benchmark job in CI. Change-Id: Iae660aea82db8f1c4567ee2982595ccfdf434fe3
-rw-r--r--tools/benchmark/README13
-rw-r--r--tools/benchmark/do_not_run_create_benchmark_data.py99
-rw-r--r--tools/benchmark/generate-statistics.py195
3 files changed, 307 insertions, 0 deletions
diff --git a/tools/benchmark/README b/tools/benchmark/README
new file mode 100644
index 000000000..25590fe1e
--- /dev/null
+++ b/tools/benchmark/README
@@ -0,0 +1,13 @@
+This folder contains two files:
+
+* do_not_run_create_benchmark_data.py - This script will destroy your
+ ironic database. DO NOT RUN IT. You have been warned!
+ It is intended to generate a semi-random database of node data
+ which can be used for benchmarks, instead of crafting a raw SQL file
+ representing a test model.
+
+* generate-statistics.py - This utility generates some statistics to both
+ aid in basic benchmarking of ironic operations *and* provide developers
+ with conceptual information regarding a deployment's size. It operates
+ only by reading the data present and timing how long the results take to
+ return as well as isolating some key details about the deployment.
diff --git a/tools/benchmark/do_not_run_create_benchmark_data.py b/tools/benchmark/do_not_run_create_benchmark_data.py
new file mode 100644
index 000000000..1e050a0f1
--- /dev/null
+++ b/tools/benchmark/do_not_run_create_benchmark_data.py
@@ -0,0 +1,99 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import sys
+import time
+
+from oslo_db.sqlalchemy import enginefacade
+from sqlalchemy import sql
+
+from ironic.common import service
+from ironic.conf import CONF # noqa To Load Configuration
+from ironic.objects import node
+
+
def _create_test_nodes(node_count=10000):
    """Create a batch of fake nodes to populate a benchmark database.

    Each node is built with an ``ipmi`` driver, a ``power off`` power state,
    a name of the form ``BenchmarkTestNode-<i>`` and a few per-node values
    derived from the loop index, then persisted via ``create()``. Progress
    is printed whenever more than ten seconds have passed since the last
    progress line, and a summary is printed at the end.

    :param node_count: Number of nodes to create. Defaults to 10000, the
        value that used to be hard-coded in this function.
    """
    print("Starting creation of fake nodes.")
    start = time.time()
    checkin = start
    for i in range(node_count):
        # NOTE(review): the field dict is handed to Node() as its single
        # positional argument. Confirm against the Node constructor that
        # this is interpreted as field values and not as a context object.
        new_node = node.Node({
            'power_state': 'power off',
            'driver': 'ipmi',
            'driver_internal_info': {'test-meow': i},
            'name': 'BenchmarkTestNode-%s' % i,
            'driver_info': {
                'ipmi_username': 'admin',
                'ipmi_password': 'admin',
                'ipmi_address': 'testhost%s.env.top.level.domain' % i},
            'resource_class': 'CUSTOM_BAREMETAL',
            'properties': {
                'cpu': 4,
                'memory': 32,
                'cats': i,
                'meowing': True}})
        new_node.create()
        # Only report progress occasionally so the output stays readable
        # while thousands of nodes are being inserted.
        delta = time.time() - checkin
        if delta > 10:
            checkin = time.time()
            print('* At %s nodes, %0.02f seconds. Total elapsed: %s'
                  % (i, delta, time.time() - start))
    elapse = time.time() - start
    print('Created %s nodes in %s seconds.\n' % (node_count, elapse))
+
+
def _mix_up_nodes_data():
    """Randomise indexed node columns with a series of raw SQL UPDATEs.

    The statements flip maintenance, driver, reservation, provision and
    power state, owner, lessee, instance_uuid and last_error on random
    subsets of the ``nodes`` table so the data looks less uniform.

    All statements run on one connection inside a single transaction: it is
    committed when the block exits normally, rolled back if a statement
    raises, and the connection is released in both cases. Previously the
    connection was opened and never closed.
    """
    engine = enginefacade.writer.get_engine()

    # A list of commands to mix up indexed field data a bit to emulate what
    # a production database may somewhat look like.
    # NOTE(review): RAND() and UUID() look like MySQL/MariaDB functions -
    # confirm the target database supports them before running.
    commands = [
        "UPDATE nodes set maintenance = True where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set driver = 'redfish' where RAND() < 0.5",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor01' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor02' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor03' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor04' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor05' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set reservation = 'fake_conductor06' where RAND() < 0.02",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'active' where RAND() < 0.8",  # noqa Easier to read this way
        "UPDATE nodes set power_state = 'power on' where provision_state = 'active' and RAND() < 0.95",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'available' where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'manageable' where RAND() < 0.1",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'clean wait' where RAND() < 0.05",  # noqa Easier to read this way
        "UPDATE nodes set provision_state = 'error' where RAND() < 0.05",  # noqa Easier to read this way
        "UPDATE nodes set owner = (select UUID()) where RAND() < 0.2",  # noqa Easier to read this way
        "UPDATE nodes set lessee = (select UUID()) where RAND() < 0.2",  # noqa Easier to read this way
        "UPDATE nodes set instance_uuid = (select UUID()) where RAND() < 0.95 and provision_state = 'active'",  # noqa Easier to read this way
        "UPDATE nodes set last_error = (select UUID()) where RAND() <0.05",  # noqa Easier to read this way
    ]
    start = time.time()
    with engine.begin() as conn:
        for command in commands:
            print("Executing SQL command: \\" + command + ";\n")
            conn.execute(sql.text(command))
            print("* Completed command. %0.04f elapsed since start of "
                  "commands." % (time.time() - start))
+
+
def main():
    """Prepare the ironic service environment and create the fake nodes.

    Debug logging is switched off before the nodes are created.
    """
    service.prepare_service()
    CONF.set_override('debug', False)
    # NOTE(review): _mix_up_nodes_data() is defined in this file but is not
    # invoked from here; only node creation runs.
    _create_test_nodes()
+
+
# Run only when executed as a script. main() has no return statement, so
# sys.exit() receives None and the process exits with status 0 on success.
if __name__ == '__main__':
    sys.exit(main())
diff --git a/tools/benchmark/generate-statistics.py b/tools/benchmark/generate-statistics.py
new file mode 100644
index 000000000..65e8d664f
--- /dev/null
+++ b/tools/benchmark/generate-statistics.py
@@ -0,0 +1,195 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import sys
+import time
+from unittest import mock
+
+from ironic_lib import metrics_utils
+import oslo_policy
+from oslo_utils import timeutils
+
+from ironic.api.controllers.v1 import node as node_api
+from ironic.api.controllers.v1 import utils as api_utils
+from ironic.common import context
+from ironic.common import service
+from ironic.conf import CONF # noqa To Load Configuration
+from ironic.db import api as db_api
+from ironic.objects import conductor
+from ironic.objects import node
+
+
def _calculate_delta(start, finish):
    """Return the difference ``finish - start`` between two readings.

    :param start: The earlier reading.
    :param finish: The later reading.
    :returns: ``finish`` minus ``start``.
    """
    elapsed = finish - start
    return elapsed
+
+
def _add_a_line():
    """Print a horizontal rule of dashes to separate report sections."""
    rule = '------------------------------------------------------------'
    print(rule)
+
+
def _assess_db_performance():
    """Time obtaining the DB API handle and listing nodes through it.

    Prints how long it took to obtain the DB client and how long the node
    list query took, together with the number of nodes returned.

    :returns: The number of nodes returned by the DB API node list call.
    """
    began_at = time.time()
    database = db_api.get_instance()
    print('Phase - Assess DB performance')
    _add_a_line()
    client_ready_at = time.time()
    # The length is taken before the second timestamp, so counting the
    # returned rows is part of the measured query interval.
    total = len(database.get_node_list())
    listed_at = time.time()
    print('Obtained DB client in %s seconds.'
          % _calculate_delta(began_at, client_ready_at))
    print('Returned %s nodes in python %s seconds from the DB.\n'
          % (total, _calculate_delta(client_ready_at, listed_at)))
    # Hand the node count back for future use.
    return total
+
+
def _assess_db_and_object_performance():
    """Time listing nodes as objects and converting each one to JSON.

    Prints how long the object list took to obtain, how long it took to
    iterate and serialise every node, the accumulated size of the
    serialised nodes, and a count of each ``vendor`` value found in the
    nodes' ``driver_internal_info``.
    """
    # Local imports keep this change self-contained in the function.
    import collections
    import json

    print('Phase - Assess DB & Object conversion Performance')
    _add_a_line()
    start = time.time()
    node_list = node.Node().list(context.get_admin_context())
    got_list = time.time()
    delta = _calculate_delta(start, got_list)
    print('Obtained list of node objects in %s seconds.' % delta)
    count = 0
    tbl_size = 0
    # In a sense, this helps provide a relative understanding if the
    # database is the bottleneck, or the objects post conversion.
    # converting completely to json and then measuring the size helps
    # ensure that everything is "assessed" while not revealing too
    # much detail.
    for node_obj in node_list:
        # sys.getsizeof() on the dict would only measure the dict container
        # itself, not its contents, so serialise and measure the JSON text.
        # default=str is deliberate: values json cannot encode natively are
        # stringified purely so they contribute to the size estimate.
        serialized = json.dumps(node_obj.as_dict(secure=True), default=str)
        tbl_size = tbl_size + len(serialized.encode('utf-8'))
        count = count + 1
    delta = _calculate_delta(got_list, time.time())
    print('Took %s seconds to iterate through %s node objects.' %
          (delta, count))
    print('Nodes table is roughly %s bytes of JSON.\n' % tbl_size)
    # Kept as a separate pass so the timing above is not affected.
    # NOTE(review): assumes vendor values are hashable (e.g. strings).
    observed_vendors = collections.Counter()
    for node_obj in node_list:
        vendor = node_obj.driver_internal_info.get('vendor')
        if vendor:
            observed_vendors[vendor] += 1
    # The vendors were previously gathered and then discarded; report them.
    print('Observed vendors: %s\n' % dict(observed_vendors))
+
+
@mock.patch('ironic.api.request')  # noqa patch needed for the object model
@mock.patch.object(metrics_utils, 'get_metrics_logger', lambda *_: mock.Mock)
@mock.patch.object(api_utils, 'check_list_policy', lambda *_: None)
@mock.patch.object(api_utils, 'check_allow_specify_fields', lambda *_: None)
@mock.patch.object(api_utils, 'check_allowed_fields', lambda *_: None)
@mock.patch.object(oslo_policy.policy, 'LOG', autospec=True)
def _assess_db_object_and_api_performance(mock_log, mock_request):
    """Time fetching every node through the nodes API controller.

    Calls ``NodesController._get_nodes_collection`` directly with a fixed
    field list, sorted by ``id`` ascending, and keeps requesting further
    pages using the last returned node's uuid as the marker. Prints
    progress per page and the total time and node count at the end.

    :param mock_log: Mock replacing ``oslo_policy.policy.LOG``, injected by
        the decorator.
    :param mock_request: Mock replacing ``ironic.api.request``, injected by
        the decorator.
    """
    # The banner used to repeat the previous phase's title verbatim, which
    # made the two phases indistinguishable in the output.
    print('Phase - Assess DB, Object conversion & API Performance')
    _add_a_line()
    # Just mock it to silence it since getting the logger to update
    # config seems like not a thing once started. :\
    mock_log.debug = mock.Mock()
    # Internal logic requires major/minor versions and a context to
    # proceed. This is just to make the NodesController respond properly.
    mock_request.context = context.get_admin_context()
    mock_request.version.major = 1
    mock_request.version.minor = 71

    start = time.time()
    node_api_controller = node_api.NodesController()
    node_api_controller.context = context.get_admin_context()
    fields = ("uuid,power_state,target_power_state,provision_state,"
              "target_provision_state,last_error,maintenance,properties,"
              "instance_uuid,traits,resource_class").split(',')

    def _fetch_page(marker):
        # One page of nodes starting after ``marker`` (None = first page).
        # A fresh copy of the field list is passed on every call, as the
        # original code built a new list per request.
        return node_api_controller._get_nodes_collection(
            chassis_uuid=None,
            instance_uuid=None,
            associated=None,
            maintenance=None,
            retired=None,
            provision_state=None,
            marker=marker,
            limit=None,
            sort_key="id",
            sort_dir="asc",
            fields=list(fields))

    res = _fetch_page(None)
    total_nodes = len(res['nodes'])
    # "> 1" rather than "!= 1": an empty first page must not enter the loop,
    # where indexing the last node would raise IndexError. A single-node
    # page still ends the walk, exactly as before.
    while len(res['nodes']) > 1:
        print(" ** Getting nodes ** %s Elapsed: %s seconds." %
              (total_nodes, _calculate_delta(start, time.time())))
        res = _fetch_page(res['nodes'][-1]['uuid'])
        new_nodes = len(res['nodes'])
        if new_nodes == 0:
            break
        total_nodes = total_nodes + new_nodes

    delta = _calculate_delta(start, time.time())
    print('Took %s seconds to return all %s nodes via '
          'nodes API call pattern.\n' % (delta, total_nodes))
+
+
def _report_conductors():
    """Print a summary of conductors, conductor groups and drivers.

    Reports the total number of conductors, how many were updated within
    the last 90 seconds, how many have a conductor group set, the number of
    distinct groups, and the distinct drivers across all conductors.
    """
    print('Phase - identifying conductors/drivers')
    _add_a_line()
    conductors = conductor.Conductor().list(
        context.get_admin_context(),
    )
    drivers = set()
    groups = []
    online_count = 0
    online_by = timeutils.utcnow(with_timezone=True) - \
        datetime.timedelta(seconds=90)
    for conductor_obj in conductors:
        if conductor_obj.conductor_group:
            groups.append(conductor_obj.conductor_group)
        # NOTE(review): presumably updated_at can be unset for a conductor
        # record that was never refreshed - confirm. Such a record is
        # treated as not online instead of raising TypeError on comparison.
        updated_at = conductor_obj.updated_at
        if updated_at is not None and updated_at > online_by:
            online_count = online_count + 1
        drivers.update(conductor_obj.drivers)
    conductor_count = len(conductors)
    print('Conductor count: %s' % conductor_count)
    print('Online conductor count: %s' % online_count)
    running_with_groups = len(groups)
    print('Conductors with conductor_groups: %s' % running_with_groups)
    group_count = len(set(groups))
    print('Conductor group count: %s' % group_count)
    # Sorted so that repeated runs print the drivers in the same order.
    driver_list = sorted(drivers)
    print('Presently supported drivers: %s' % driver_list)
+
+
def main():
    """Prepare the service environment and run each reporting phase in turn.

    Debug logging is switched off first; the phases then run in a fixed
    order: DB, DB plus objects, DB plus objects plus API, and conductors.
    """
    service.prepare_service()
    CONF.set_override('debug', False)
    phases = (
        _assess_db_performance,
        _assess_db_and_object_performance,
        _assess_db_object_and_api_performance,
        _report_conductors,
    )
    for run_phase in phases:
        run_phase()
+
+
# Run only when executed as a script. main() has no return statement, so
# sys.exit() receives None and the process exits with status 0 on success.
if __name__ == '__main__':
    sys.exit(main())