summaryrefslogtreecommitdiff
path: root/ovsdb/storage.c
diff options
context:
space:
mode:
authorIlya Maximets <i.maximets@ovn.org>2021-05-06 14:47:31 +0200
committerIlya Maximets <i.maximets@ovn.org>2021-05-14 16:00:22 +0200
commit3c2d6274bceecb65ec8f2f93f2aac26897a7ddfe (patch)
treeacd05ca86f831079bb122cbb1521685c2caaaf77 /ovsdb/storage.c
parentb5bb044fbe4c1395dcde5cc7d5081ef0099bb8b3 (diff)
downloadopenvswitch-3c2d6274bceecb65ec8f2f93f2aac26897a7ddfe.tar.gz
raft: Transfer leadership before creating snapshots.
With a big database, writing a snapshot could take a lot of time; for example, on one of the systems, compaction of a 300MB database takes about 10 seconds to complete. For the clustered database, 40% of this time is taken by conversion of the database to the file transaction json format; the rest of the time is spent formatting a string and writing to disk. Of course, this highly depends on the disk and CPU speeds. 300MB is a very possible database size for the OVN Southbound DB, and it might be even bigger than that. During compaction the database is not available and the ovsdb-server doesn't perform any other tasks. If the leader spends 10-15 seconds writing a snapshot, the cluster is not functional for that time period. The leader also likely has some monitors to serve, so one poll interval may be 15-20 seconds long in the end. Systems with such big databases typically have very high election timers configured (16 seconds), so followers will start an election only after this significant amount of time. Once the leader is back to the operational state, it will re-connect and try to join the cluster back. In some cases, this might also trigger 'connected' state flapping on the old leader, triggering a re-connection of clients. This issue has been observed with large-scale OVN deployments. One of the methods to improve the situation is to transfer leadership before compacting. This allows the cluster to remain functional while one of the servers writes a snapshot. Additionally, the time spent for compaction is now logged if it was longer than 1 second. This adds a bit of visibility into 'unreasonably long poll intervals'. Reported-at: https://bugzilla.redhat.com/1960391 Signed-off-by: Ilya Maximets <i.maximets@ovn.org> Acked-by: Dumitru Ceara <dceara@redhat.com>
Diffstat (limited to 'ovsdb/storage.c')
-rw-r--r--ovsdb/storage.c24
1 file changed, 17 insertions, 7 deletions
diff --git a/ovsdb/storage.c b/ovsdb/storage.c
index f662e9056..40415fcf6 100644
--- a/ovsdb/storage.c
+++ b/ovsdb/storage.c
@@ -519,14 +519,11 @@ ovsdb_storage_should_snapshot(const struct ovsdb_storage *storage)
return false;
}
- /* If we can't snapshot right now, don't. */
- if (storage->raft && !raft_may_snapshot(storage->raft)) {
- return false;
- }
-
uint64_t log_len = (storage->raft
? raft_get_log_length(storage->raft)
: storage->n_read + storage->n_written);
+ bool snapshot_recommended = false;
+
if (now < storage->next_snapshot_max) {
/* Maximum snapshot time not yet reached. Take a snapshot if there
* have been at least 100 log entries and the log file size has
@@ -534,12 +531,25 @@ ovsdb_storage_should_snapshot(const struct ovsdb_storage *storage)
bool grew_lots = (storage->raft
? raft_grew_lots(storage->raft)
: ovsdb_log_grew_lots(storage->log));
- return log_len >= 100 && grew_lots;
+ snapshot_recommended = (log_len >= 100 && grew_lots);
} else {
/* We have reached the maximum snapshot time. Take a snapshot if
* there have been any log entries at all. */
- return log_len > 0;
+ snapshot_recommended = (log_len > 0);
}
+
+ if (!snapshot_recommended) {
+ return false;
+ }
+
+ /* If we can't snapshot right now, don't. */
+ if (storage->raft && !raft_may_snapshot(storage->raft)) {
+ /* Notifying the storage that it needs to make a snapshot soon. */
+ raft_notify_snapshot_recommended(storage->raft);
+ return false;
+ }
+
+ return true;
}
return false;