summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJulia Kreger <juliaashleykreger@gmail.com>2023-04-27 16:18:14 -0700
committerJulia Kreger <juliaashleykreger@gmail.com>2023-05-01 15:35:33 -0700
commit75b881bd31952b8f858b4708fb83f7d2ca7c42cc (patch)
treecac2546a65a6b901bd0e283df5b735918b9a1d2d
parent5e6fa6ef3060af766231bafbd756401d4af9ee84 (diff)
downloadironic-75b881bd31952b8f858b4708fb83f7d2ca7c42cc.tar.gz
Fix DB/Lock session handling issues
Prior to this fix, we have been unable to run the Metal3 CI job with SQLAlchemy's internal autocommit setting enabled. However, that setting is deprecated and needs to be removed. Investigating our DB queries and request patterns, we were able to identify some queries which generally resulted in the underlying task and lock being held longer because the output was not actually returned, which is something we've generally had to fix in some places previously. Making some of these changes did drastically reduce the number of errors encountered with the Metal3 CI job, however it did not eliminate them entirely. Upon further investigation, we were able to determine that the underlying issue we were encountering occurred when we had an external semi-random reader, such as Metal3 polling endpoints: we could reach a situation where we would be blocked from updating the database because, to open a write lock, we need the active readers not to be interacting with the database, and with a random reader of sorts, the only realistic option we have is to enable the Write-Ahead Log[0]. We didn't have to do this with SQLAlchemy previously because the autocommit behavior hid the complexities from us, but in order to move to SQLAlchemy 2.0, we do need to remove autocommit. Additionally, this adds two unit tests for the get_node_with_token rpc method, which apparently we missed or lost somewhere along the way. It also adds notes to two database interactions to suggest we look at them in the future, as they may not be the most efficient path forward. [0]: https://www.sqlite.org/wal.html Change-Id: Iebcc15fe202910b942b58fc004d077740ec61912
-rw-r--r--ironic/conductor/manager.py17
-rw-r--r--ironic/conductor/utils.py1
-rw-r--r--ironic/conf/opts.py1
-rw-r--r--ironic/db/sqlalchemy/__init__.py19
-rw-r--r--ironic/db/sqlalchemy/api.py34
-rw-r--r--ironic/tests/unit/conductor/test_manager.py29
-rw-r--r--releasenotes/notes/fix-db-sqlite-OperationalError-7934dbda2a21c69e.yaml6
7 files changed, 78 insertions, 29 deletions
diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py
index bbd2355bd..ef850753d 100644
--- a/ironic/conductor/manager.py
+++ b/ironic/conductor/manager.py
@@ -2284,10 +2284,13 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.debug("RPC create_port called for port %s.", port_uuid)
with task_manager.acquire(context, port_obj.node_id,
- purpose='port create') as task:
+ purpose='port create',
+ shared=True) as task:
+ # NOTE(TheJulia): We're creating a port, we don't need
+ # an exclusive parent lock to do so.
utils.validate_port_physnet(task, port_obj)
port_obj.create()
- return port_obj
+ return port_obj
@METRICS.timer('ConductorManager.update_port')
@messaging.expected_exceptions(exception.NodeLocked,
@@ -2373,7 +2376,7 @@ class ConductorManager(base_manager.BaseConductorManager):
port_obj.save()
- return port_obj
+ return port_obj
@METRICS.timer('ConductorManager.update_portgroup')
@messaging.expected_exceptions(exception.NodeLocked,
@@ -2452,7 +2455,7 @@ class ConductorManager(base_manager.BaseConductorManager):
portgroup_obj.save()
- return portgroup_obj
+ return portgroup_obj
@METRICS.timer('ConductorManager.update_volume_connector')
@messaging.expected_exceptions(
@@ -2496,7 +2499,7 @@ class ConductorManager(base_manager.BaseConductorManager):
connector.save()
LOG.info("Successfully updated volume connector %(connector)s.",
{'connector': connector.uuid})
- return connector
+ return connector
@METRICS.timer('ConductorManager.update_volume_target')
@messaging.expected_exceptions(
@@ -2537,7 +2540,7 @@ class ConductorManager(base_manager.BaseConductorManager):
target.save()
LOG.info("Successfully updated volume target %(target)s.",
{'target': target.uuid})
- return target
+ return target
@METRICS.timer('ConductorManager.get_driver_properties')
@messaging.expected_exceptions(exception.DriverNotFound)
@@ -3564,7 +3567,7 @@ class ConductorManager(base_manager.BaseConductorManager):
{'node': task.node.uuid})
utils.add_secret_token(task.node)
task.node.save()
- return task.node
+ return objects.Node.get(context, node_id)
@METRICS.timer('ConductorManager.manage_node_history')
@periodics.periodic(
diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py
index add9ee74d..39886bfd8 100644
--- a/ironic/conductor/utils.py
+++ b/ironic/conductor/utils.py
@@ -808,7 +808,6 @@ def power_state_error_handler(e, node, power_state):
{'node': node.uuid, 'power_state': power_state})
-@task_manager.require_exclusive_lock
def validate_port_physnet(task, port_obj):
"""Validate the consistency of physical networks of ports in a portgroup.
diff --git a/ironic/conf/opts.py b/ironic/conf/opts.py
index a7ebcfb30..0d8aeafd2 100644
--- a/ironic/conf/opts.py
+++ b/ironic/conf/opts.py
@@ -78,7 +78,6 @@ def update_opt_defaults():
# This comes in two flavors
'oslo.messaging=INFO',
'oslo_messaging=INFO',
- 'sqlalchemy=WARNING',
'stevedore=INFO',
'eventlet.wsgi.server=INFO',
'iso8601=WARNING',
diff --git a/ironic/db/sqlalchemy/__init__.py b/ironic/db/sqlalchemy/__init__.py
index 173b91fcc..c656ed227 100644
--- a/ironic/db/sqlalchemy/__init__.py
+++ b/ironic/db/sqlalchemy/__init__.py
@@ -10,9 +10,28 @@
# License for the specific language governing permissions and limitations
# under the License.
+from oslo_config import cfg
from oslo_db.sqlalchemy import enginefacade
+from sqlalchemy.engine import Engine
+from sqlalchemy import event
+
+CONF = cfg.CONF
# FIXME(stephenfin): we need to remove reliance on autocommit semantics ASAP
# since it's not compatible with SQLAlchemy 2.0
# NOTE(dtantsur): we want sqlite as close to a real database as possible.
enginefacade.configure(sqlite_fk=True, __autocommit=True)
+
+
+# NOTE(TheJulia): Setup a listener to trigger the sqlite write-ahead
+# log to be utilized to permit concurrent access, which is needed
+# as we can get read requests while we are writing via the API
+# surface *when* we're using sqlite as the database backend.
+@event.listens_for(Engine, "connect")
+def _setup_journal_mode(dbapi_connection, connection_record):
+ # NOTE(TheJulia): The string may not be loaded in some unit
+ # tests so handle whatever the output is as a string so we
+ # can lower/compare it and send the appropriate command to
+ # the database.
+ if 'sqlite' in str(CONF.database.connection).lower():
+ dbapi_connection.execute("PRAGMA journal_mode=WAL")
diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py
index 31ec9647e..3ba31e233 100644
--- a/ironic/db/sqlalchemy/api.py
+++ b/ironic/db/sqlalchemy/api.py
@@ -324,7 +324,9 @@ def _paginate_query(model, limit=None, marker=None, sort_key=None,
# object is garbage collected as ORM Query objects allow
# for DB interactions to occur after the fact, so it remains
# connected to the DB..
- return query.all()
+ # Save the query.all() results, but don't return yet, so we
+ # begin to exit and unwind the session.
+ ref = query.all()
else:
# In this case, we have a sqlalchemy.sql.selectable.Select
# (most likely) which utilizes the unified select interface.
@@ -338,7 +340,10 @@ def _paginate_query(model, limit=None, marker=None, sort_key=None,
# Everything is a tuple in a resultset from the unified interface
# but for objects, our model expects just object access,
# so we extract and return them.
- return [r[0] for r in res]
+ ref = [r[0] for r in res]
+ # Return the results to the caller, outside of the session context
+ # if an ORM object, because we want the session to close.
+ return ref
def _filter_active_conductors(query, interval=None):
@@ -1341,6 +1346,12 @@ class Connection(api.Connection):
def get_active_hardware_type_dict(self, use_groups=False):
with _session_for_read() as session:
+ # TODO(TheJulia): We should likely take a look at this
+ # joined query, as we may not be getting what we expect.
+ # Metal3 logs upwards of 200 rows returned with multiple datetime
+ # columns.
+ # Given dualing datetime fields, we really can't just expect
+ # requesting a unique set to "just work".
query = (session.query(models.ConductorHardwareInterfaces,
models.Conductor)
.join(models.Conductor))
@@ -1375,7 +1386,8 @@ class Connection(api.Connection):
with _session_for_read() as session:
query = (session.query(models.ConductorHardwareInterfaces)
.filter_by(conductor_id=conductor_id))
- return query.all()
+ ref = query.all()
+ return ref
def list_hardware_type_interfaces(self, hardware_types):
with _session_for_read() as session:
@@ -1397,6 +1409,8 @@ class Connection(api.Connection):
conductor_hw_iface['conductor_id'] = conductor_id
for k, v in iface.items():
conductor_hw_iface[k] = v
+ # TODO(TheJulia): Uhh... We should try to do this as one
+ # bulk operation and not insert each row.
session.add(conductor_hw_iface)
session.flush()
except db_exc.DBDuplicateEntry as e:
@@ -2080,9 +2094,10 @@ class Connection(api.Connection):
query = session.query(models.Allocation).filter_by(
id=allocation_id)
try:
- return query.one()
+ ref = query.one()
except NoResultFound:
raise exception.AllocationNotFound(allocation=allocation_id)
+ return ref
def get_allocation_by_uuid(self, allocation_uuid):
"""Return an allocation representation.
@@ -2095,9 +2110,10 @@ class Connection(api.Connection):
query = session.query(models.Allocation).filter_by(
uuid=allocation_uuid)
try:
- return query.one()
+ ref = query.one()
except NoResultFound:
raise exception.AllocationNotFound(allocation=allocation_uuid)
+ return ref
def get_allocation_by_name(self, name):
"""Return an allocation representation.
@@ -2109,9 +2125,10 @@ class Connection(api.Connection):
with _session_for_read() as session:
query = session.query(models.Allocation).filter_by(name=name)
try:
- return query.one()
+ ref = query.one()
except NoResultFound:
raise exception.AllocationNotFound(allocation=name)
+ return ref
def get_allocation_list(self, filters=None, limit=None, marker=None,
sort_key=None, sort_dir=None):
@@ -2455,7 +2472,7 @@ class Connection(api.Connection):
session.flush()
except db_exc.DBDuplicateEntry:
raise exception.NodeHistoryAlreadyExists(uuid=values['uuid'])
- return history
+ return history
@oslo_db_api.retry_on_deadlock
def destroy_node_history_by_uuid(self, history_uuid):
@@ -2469,9 +2486,10 @@ class Connection(api.Connection):
def get_node_history_by_id(self, history_id):
query = model_query(models.NodeHistory).filter_by(id=history_id)
try:
- return query.one()
+ res = query.one()
except NoResultFound:
raise exception.NodeHistoryNotFound(history=history_id)
+ return res
def get_node_history_by_uuid(self, history_uuid):
query = model_query(models.NodeHistory).filter_by(uuid=history_uuid)
diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py
index 5244e4c4b..f0fb075b7 100644
--- a/ironic/tests/unit/conductor/test_manager.py
+++ b/ironic/tests/unit/conductor/test_manager.py
@@ -3568,6 +3568,23 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, mgr_utils.CommonMixIn,
filters=mock.sentinel.filters))
self.assertEqual([], result)
+ def test_get_node_with_token(self):
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ network_interface='noop')
+ self.assertNotIn('agent_secret_token', node.driver_internal_info)
+ res = self.service.get_node_with_token(self.context, node.id)
+ self.assertIn('agent_secret_token', res.driver_internal_info)
+
+ def test_node_with_token_already_set(self):
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ network_interface='noop',
+ driver_internal_info={'agent_secret_token': 'secret'})
+ res = self.service.get_node_with_token(self.context, node.id)
+ self.assertEqual('******',
+ res.driver_internal_info['agent_secret_token'])
+
@mgr_utils.mock_record_keepalive
class ConsoleTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@@ -3953,18 +3970,6 @@ class CreatePortTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertEqual({'foo': 'bar'}, res.extra)
mock_validate.assert_called_once_with(mock.ANY, port)
- def test_create_port_node_locked(self):
- node = obj_utils.create_test_node(self.context, driver='fake-hardware',
- reservation='fake-reserv')
- port = obj_utils.get_test_port(self.context, node_id=node.id)
- exc = self.assertRaises(messaging.rpc.ExpectedException,
- self.service.create_port,
- self.context, port)
- # Compare true exception hidden by @messaging.expected_exceptions
- self.assertEqual(exception.NodeLocked, exc.exc_info[0])
- self.assertRaises(exception.PortNotFound, port.get_by_uuid,
- self.context, port.uuid)
-
@mock.patch.object(conductor_utils, 'validate_port_physnet', autospec=True)
def test_create_port_mac_exists(self, mock_validate):
node = obj_utils.create_test_node(self.context, driver='fake-hardware')
diff --git a/releasenotes/notes/fix-db-sqlite-OperationalError-7934dbda2a21c69e.yaml b/releasenotes/notes/fix-db-sqlite-OperationalError-7934dbda2a21c69e.yaml
new file mode 100644
index 000000000..85a604ae4
--- /dev/null
+++ b/releasenotes/notes/fix-db-sqlite-OperationalError-7934dbda2a21c69e.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+ - |
+ Fixes issues in Ironic's use of SQLAlchemy with SQLite Databases,
+ which is common with users like Metal3, which prevented Ironic from
+ supporting SQLAlchemy 2.0 properly, as autocommit was re-enabled.